From cafddc22ea4bf19d237083102a6a369aa5b37115 Mon Sep 17 00:00:00 2001
From: rudrabeniwal <rudrabeniwal86@gmail.com>
Date: Thu, 22 May 2025 19:17:40 +0530
Subject: [PATCH 1/4] Modified the execute method signature in AbstractExecutor
 and SequenceExecutor to accept a Seq of Buffer for input and return a Seq of
 Buffer for output. Also removed MapExecutor.

---
 .../vulkan/executor/AbstractExecutor.scala    | 35 +++++-----
 .../cyfra/vulkan/executor/MapExecutor.scala   | 64 -------------------
 .../vulkan/executor/SequenceExecutor.scala    | 32 ++++------
 3 files changed, 29 insertions(+), 102 deletions(-)
 delete mode 100644 cyfra-vulkan/src/main/scala/io/computenode/cyfra/vulkan/executor/MapExecutor.scala

diff --git a/cyfra-vulkan/src/main/scala/io/computenode/cyfra/vulkan/executor/AbstractExecutor.scala b/cyfra-vulkan/src/main/scala/io/computenode/cyfra/vulkan/executor/AbstractExecutor.scala
index f5e8a368..37e1dacc 100644
--- a/cyfra-vulkan/src/main/scala/io/computenode/cyfra/vulkan/executor/AbstractExecutor.scala
+++ b/cyfra-vulkan/src/main/scala/io/computenode/cyfra/vulkan/executor/AbstractExecutor.scala
@@ -7,11 +7,10 @@ import io.computenode.cyfra.vulkan.core.Device
 import io.computenode.cyfra.vulkan.memory.{Allocator, Buffer, DescriptorPool, DescriptorSet}
 import org.lwjgl.BufferUtils
 import org.lwjgl.util.vma.Vma.VMA_MEMORY_USAGE_UNKNOWN
+import org.lwjgl.util.vma.Vma.VMA_MEMORY_USAGE_GPU_TO_CPU
 import org.lwjgl.vulkan.*
 import org.lwjgl.vulkan.VK10.*
 
-import java.nio.ByteBuffer
-
 private[cyfra] abstract class AbstractExecutor(dataLength: Int, val bufferActions: Seq[BufferAction], context: VulkanContext) {
   protected val device: Device = context.device
   protected val queue: Queue = context.computeQueue
@@ -37,18 +36,11 @@ private[cyfra] abstract class AbstractExecutor(dataLength: Int, val bufferAction
       commandBuffer
     }
 
-  def execute(input: Seq[ByteBuffer]): Seq[ByteBuffer] = {
-    val stagingBuffer = new Buffer(
-      getBiggestTransportData * dataLength,
-      VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT,
-      VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT,
-      VMA_MEMORY_USAGE_UNKNOWN,
-      allocator
-    )
+  def execute(input: Seq[Buffer]): Seq[Buffer] = {
     for (i <- bufferActions.indices if bufferActions(i) == BufferAction.LoadTo) do {
-      val buffer = input(i)
-      Buffer.copyBuffer(buffer, stagingBuffer, buffer.remaining())
-      Buffer.copyBuffer(stagingBuffer, buffers(i), buffer.remaining(), commandPool).block().destroy()
+      val inputHostBuffer = input(i)
+      val gpuDeviceBuffer = buffers(i)
+      Buffer.copyBuffer(inputHostBuffer, gpuDeviceBuffer, inputHostBuffer.size, commandPool).block().destroy()
     }
 
     pushStack { stack =>
@@ -64,14 +56,17 @@ private[cyfra] abstract class AbstractExecutor(dataLength: Int, val bufferAction
     }
 
     val output = for (i <- bufferActions.indices if bufferActions(i) == BufferAction.LoadFrom) yield {
-      val fence = Buffer.copyBuffer(buffers(i), stagingBuffer, buffers(i).size, commandPool)
-      val outBuffer = BufferUtils.createByteBuffer(buffers(i).size)
-      fence.block().destroy()
-      Buffer.copyBuffer(stagingBuffer, outBuffer, outBuffer.remaining())
-      outBuffer
-
+      val gpuDeviceBuffer = buffers(i)
+      val outputHostBuffer = new Buffer(
+        gpuDeviceBuffer.size,
+        VK_BUFFER_USAGE_TRANSFER_DST_BIT,
+        VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+        VMA_MEMORY_USAGE_GPU_TO_CPU,
+        allocator
+      )
+      Buffer.copyBuffer(gpuDeviceBuffer, outputHostBuffer, gpuDeviceBuffer.size, commandPool).block().destroy()
+      outputHostBuffer
     }
-    stagingBuffer.destroy()
     output
   }
 
diff --git a/cyfra-vulkan/src/main/scala/io/computenode/cyfra/vulkan/executor/MapExecutor.scala b/cyfra-vulkan/src/main/scala/io/computenode/cyfra/vulkan/executor/MapExecutor.scala
deleted file mode 100644
index aedc82a4..00000000
--- a/cyfra-vulkan/src/main/scala/io/computenode/cyfra/vulkan/executor/MapExecutor.scala
+++ /dev/null
@@ -1,64 +0,0 @@
-package io.computenode.cyfra.vulkan.executor
-
-import io.computenode.cyfra.vulkan.compute.*
-import io.computenode.cyfra.vulkan.VulkanContext
-import io.computenode.cyfra.vulkan.compute.{Binding, ComputePipeline, InputBufferSize, Shader, UniformSize}
-import io.computenode.cyfra.vulkan.memory.{Buffer, DescriptorSet}
-import io.computenode.cyfra.vulkan.util.Util.{check, pushStack}
-import org.lwjgl.system.MemoryStack
-import org.lwjgl.system.MemoryStack.stackPush
-import org.lwjgl.util.vma.Vma.*
-import org.lwjgl.vulkan.*
-import org.lwjgl.vulkan.VK10.*
-
-import scala.collection.mutable
-import scala.util.Using
-
-/** @author
-  *   MarconZet Created 15.04.2020
-  */
-private[cyfra] class MapExecutor(dataLength: Int, bufferActions: Seq[BufferAction], computePipeline: ComputePipeline, context: VulkanContext)
-    extends AbstractExecutor(dataLength, bufferActions, context) {
-  private lazy val shader: Shader = computePipeline.computeShader
-
-  protected def getBiggestTransportData: Int = shader.layoutInfo.sets
-    .flatMap(_.bindings)
-    .collect { case Binding(_, InputBufferSize(n)) =>
-      n
-    }
-    .max
-
-  protected def setupBuffers(): (Seq[DescriptorSet], Seq[Buffer]) = pushStack { stack =>
-    val bindings = shader.layoutInfo.sets.flatMap(_.bindings)
-    val buffers = bindings.zipWithIndex.map { case (binding, i) =>
-      val bufferSize = binding.size match {
-        case InputBufferSize(n) => n * dataLength
-        case UniformSize(n)     => n
-      }
-      new Buffer(bufferSize, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | bufferActions(i).action, 0, VMA_MEMORY_USAGE_GPU_ONLY, allocator)
-    }
-
-    val bufferDeque = mutable.ArrayDeque.from(buffers)
-    val descriptorSetLayouts = computePipeline.descriptorSetLayouts
-    val descriptorSets = for (i <- descriptorSetLayouts.indices) yield {
-      val descriptorSet = new DescriptorSet(device, descriptorSetLayouts(i)._1, descriptorSetLayouts(i)._2.bindings, descriptorPool)
-      val size = descriptorSetLayouts(i)._2.bindings.size
-      descriptorSet.update(bufferDeque.take(size).toSeq)
-      bufferDeque.drop(size)
-      descriptorSet
-    }
-    (descriptorSets, buffers)
-  }
-
-  protected def recordCommandBuffer(commandBuffer: VkCommandBuffer): Unit =
-    pushStack { stack =>
-      vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, computePipeline.get)
-
-      val pDescriptorSets = stack.longs(descriptorSets.map(_.get): _*)
-      vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, computePipeline.pipelineLayout, 0, pDescriptorSets, null)
-
-      val workgroup = shader.workgroupDimensions
-      vkCmdDispatch(commandBuffer, dataLength / workgroup.x(), 1 / workgroup.y(), 1 / workgroup.z())
-    }
-
-}
diff --git a/cyfra-vulkan/src/main/scala/io/computenode/cyfra/vulkan/executor/SequenceExecutor.scala b/cyfra-vulkan/src/main/scala/io/computenode/cyfra/vulkan/executor/SequenceExecutor.scala
index 8945893b..7c85ac52 100644
--- a/cyfra-vulkan/src/main/scala/io/computenode/cyfra/vulkan/executor/SequenceExecutor.scala
+++ b/cyfra-vulkan/src/main/scala/io/computenode/cyfra/vulkan/executor/SequenceExecutor.scala
@@ -149,7 +149,7 @@ private[cyfra] class SequenceExecutor(computeSequence: ComputationSequence, cont
     setToBuffers
   }
 
-  def execute(inputs: Seq[ByteBuffer], dataLength: Int): Seq[ByteBuffer] = pushStack { stack =>
+  def execute(inputs: Seq[Buffer], dataLength: Int): Seq[Buffer] = pushStack { stack =>
     timed("Vulkan full execute"):
       val setToBuffers = createBuffers(dataLength)
 
@@ -160,17 +160,9 @@ private[cyfra] class SequenceExecutor(computeSequence: ComputationSequence, cont
           }
         }.flatten
 
-      val stagingBuffer = new Buffer(
-        inputs.map(_.remaining()).max,
-        VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT,
-        VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT,
-        VMA_MEMORY_USAGE_UNKNOWN,
-        allocator
-      )
-
-      buffersWithAction(BufferAction.LoadTo).zipWithIndex.foreach { case (buffer, i) =>
-        Buffer.copyBuffer(inputs(i), stagingBuffer, buffer.size)
-        Buffer.copyBuffer(stagingBuffer, buffer, buffer.size, commandPool).block().destroy()
+      buffersWithAction(BufferAction.LoadTo).zipWithIndex.foreach { case (gpuDeviceBuffer, i) =>
+        val inputHostBuffer = inputs(i)
+        Buffer.copyBuffer(inputHostBuffer, gpuDeviceBuffer, inputHostBuffer.size, commandPool).block().destroy()
       }
 
       val fence = new Fence(device)
@@ -185,14 +177,18 @@ private[cyfra] class SequenceExecutor(computeSequence: ComputationSequence, cont
         check(vkQueueSubmit(queue.get, submitInfo, fence.get), "Failed to submit command buffer to queue")
         fence.block().destroy()
 
-      val output = buffersWithAction(BufferAction.LoadFrom).map { buffer =>
-        Buffer.copyBuffer(buffer, stagingBuffer, buffer.size, commandPool).block().destroy()
-        val out = BufferUtils.createByteBuffer(buffer.size)
-        Buffer.copyBuffer(stagingBuffer, out, buffer.size)
-        out
+      val output = buffersWithAction(BufferAction.LoadFrom).map { gpuDeviceBuffer =>
+        val outputHostBuffer = new Buffer(
+          gpuDeviceBuffer.size,
+          VK_BUFFER_USAGE_TRANSFER_DST_BIT,
+          VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+          VMA_MEMORY_USAGE_GPU_TO_CPU, 
+          allocator
+        )
+        Buffer.copyBuffer(gpuDeviceBuffer, outputHostBuffer, gpuDeviceBuffer.size, commandPool).block().destroy()
+        outputHostBuffer
       }
 
-      stagingBuffer.destroy()
       commandPool.freeCommandBuffer(commandBuffer)
       setToBuffers.keys.foreach(_.update(Seq.empty))
       setToBuffers.flatMap(_._2).foreach(_.destroy())

From fc183b7dd0a7983cd86ef43a35cfea683ceaffd2 Mon Sep 17 00:00:00 2001
From: rudrabeniwal <rudrabeniwal86@gmail.com>
Date: Sat, 24 May 2025 04:14:49 +0530
Subject: [PATCH 2/4] Refactor memory management in FloatMem and Vec4FloatMem
 to use Vulkan buffers. Update GMem and RamGMem traits to include Vulkan
 buffer handling and cleanup methods. Introduce mapping and unmapping
 functionality in Buffer class for direct memory access.

Note: GContext still needs to be updated to align with these changes; that
is done in a follow-up patch of this series.
---
 .../cyfra/runtime/mem/FloatMem.scala          | 82 ++++++++++++++---
 .../computenode/cyfra/runtime/mem/GMem.scala  |  7 +-
 .../cyfra/runtime/mem/RamGMem.scala           |  7 +-
 .../cyfra/runtime/mem/Vec4FloatMem.scala      | 88 +++++++++++++++----
 .../cyfra/vulkan/memory/Buffer.scala          | 17 ++++
 5 files changed, 166 insertions(+), 35 deletions(-)

diff --git a/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/mem/FloatMem.scala b/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/mem/FloatMem.scala
index f8919e12..ade00101 100644
--- a/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/mem/FloatMem.scala
+++ b/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/mem/FloatMem.scala
@@ -1,29 +1,117 @@
 package io.computenode.cyfra.runtime.mem
 
 import io.computenode.cyfra.dsl.Value.Float32
+import io.computenode.cyfra.vulkan.memory.Buffer
+import io.computenode.cyfra.runtime.GContext
+import org.lwjgl.vulkan.VK10.*
+import org.lwjgl.util.vma.Vma.*
 
 import java.nio.ByteBuffer
-import org.lwjgl.system.MemoryUtil
 
-class FloatMem(val size: Int, protected val data: ByteBuffer) extends RamGMem[Float32, Float]:
-  def toArray: Array[Float] =
-    val res = data.asFloatBuffer()
-    val result = new Array[Float](size)
-    res.get(result)
-    result
+/** `size` 32-bit floats stored in the Vulkan buffer `vulkanBuffer`.
+  *
+  * The buffer is only released by an explicit call to [[cleanup]].
+  */
+class FloatMem(val size: Int, val vulkanBuffer: Buffer) extends RamGMem[Float32, Float]:
+
+  /** Reads the contents of [[vulkanBuffer]] back into a newly allocated array.
+    *
+    * The data travels through a temporary host-visible staging buffer that is destroyed before this method returns,
+    * whether or not the copy and the read succeed.
+    *
+    * @param context
+    *   provides the Vulkan allocator and command pool used for the transfer
+    * @return
+    *   an array of `size` floats
+    * @throws ArithmeticException
+    *   if `size * FloatSize` bytes do not fit in an `Int`
+    */
+  def toArray(using context: GContext): Array[Float] =
+    val allocator = context.vkContext.allocator
+    val commandPool = context.vkContext.commandPool
+    val bufferSize = size.toLong * FloatMem.FloatSize
+
+    val stagingBuffer = new Buffer(
+      Math.toIntExact(bufferSize), // reject oversized requests instead of silently truncating them
+      VK_BUFFER_USAGE_TRANSFER_DST_BIT,
+      VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+      VMA_MEMORY_USAGE_GPU_TO_CPU,
+      allocator
+    )
+
+    try
+      // Block until the copy has finished, then release the fence returned by copyBuffer.
+      Buffer.copyBuffer(vulkanBuffer, stagingBuffer, bufferSize, commandPool).block().destroy()
+
+      val result = new Array[Float](size)
+      val byteBuffer = stagingBuffer.map()
+      try byteBuffer.asFloatBuffer().get(result)
+      finally stagingBuffer.unmap()
+      result
+    finally stagingBuffer.destroy()
 
+  /** Destroys the underlying [[vulkanBuffer]]. */
+  def cleanup(): Unit =
+    vulkanBuffer.destroy()
 
 object FloatMem {
   val FloatSize = 4
 
-  def apply(floats: Array[Float]): FloatMem =
-    val size = floats.length
-    val data = ByteBuffer.allocateDirect(size * FloatSize)
-    data.asFloatBuffer().put(floats)
-    data.rewind()
-    new FloatMem(size, data)
-
-  def apply(size: Int): FloatMem = 
-    val data = ByteBuffer.allocateDirect(size * FloatSize)
-    new FloatMem(size, data)
+  /** Uploads `floats` into a newly allocated device buffer and wraps it in a [[FloatMem]].
+    *
+    * The values are written to a temporary host-visible staging buffer and copied to the device buffer with a blocking
+    * transfer. The staging buffer is always destroyed; the device buffer is destroyed too if the upload fails.
+    *
+    * @param floats
+    *   values to upload
+    * @param context
+    *   provides the Vulkan allocator and command pool used for the transfer
+    * @throws ArithmeticException
+    *   if `floats.length * FloatSize` bytes do not fit in an `Int`
+    */
+  def apply(floats: Array[Float])(using context: GContext): FloatMem =
+    val size = floats.length
+    val bufferSize = size.toLong * FloatSize
+    val commandPool = context.vkContext.commandPool
+
+    val stagingBuffer = new Buffer(
+      Math.toIntExact(bufferSize),
+      VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
+      VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+      VMA_MEMORY_USAGE_CPU_ONLY,
+      context.vkContext.allocator
+    )
+
+    try
+      val byteBuffer = stagingBuffer.map()
+      try byteBuffer.asFloatBuffer().put(floats)
+      finally stagingBuffer.unmap()
+
+      val deviceBuffer = newDeviceBuffer(bufferSize)
+      try
+        Buffer.copyBuffer(stagingBuffer, deviceBuffer, bufferSize, commandPool).block().destroy()
+        new FloatMem(size, deviceBuffer)
+      catch
+        case scala.util.control.NonFatal(e) =>
+          deviceBuffer.destroy() // do not leak the device allocation when the upload fails
+          throw e
+    finally stagingBuffer.destroy()
+
+  /** Allocates a device buffer for `size` floats, without writing any data to it, and wraps it in a [[FloatMem]].
+    *
+    * @throws ArithmeticException
+    *   if `size * FloatSize` bytes do not fit in an `Int`
+    */
+  def apply(size: Int)(using context: GContext): FloatMem =
+    new FloatMem(size, newDeviceBuffer(size.toLong * FloatSize))
+
+  /** Allocates a GPU-only storage buffer of `bufferSize` bytes usable as both source and target of transfers. */
+  private def newDeviceBuffer(bufferSize: Long)(using context: GContext): Buffer =
+    new Buffer(
+      Math.toIntExact(bufferSize),
+      VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
+      0,
+      VMA_MEMORY_USAGE_GPU_ONLY,
+      context.vkContext.allocator
+    )
 }
diff --git a/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/mem/GMem.scala b/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/mem/GMem.scala
index eb04de4c..01cd8d0f 100644
--- a/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/mem/GMem.scala
+++ b/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/mem/GMem.scala
@@ -12,16 +12,19 @@ import izumi.reflect.Tag
 import org.lwjgl.system.MemoryUtil
 
 import java.nio.ByteBuffer
-
+import io.computenode.cyfra.vulkan.memory.Buffer
 trait GMem[H <: Value]:
   def size: Int
-  def toReadOnlyBuffer: ByteBuffer
+  def vulkanBuffer: Buffer
   def map[
     G <: GStruct[G] : Tag : GStructSchema,
     R <: Value : FromExpr : Tag
   ](fn: GFunction[G, H, R])(using context: GContext): GMem[R] =
     context.execute(this, fn)
 
+  def cleanup(): Unit
+end GMem
+
 object GMem:
   type fRGBA = (Float, Float, Float, Float)
 
diff --git a/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/mem/RamGMem.scala b/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/mem/RamGMem.scala
index 43e45f30..3aebacd1 100644
--- a/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/mem/RamGMem.scala
+++ b/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/mem/RamGMem.scala
@@ -1,9 +1,13 @@
 package io.computenode.cyfra.runtime.mem
 
 import io.computenode.cyfra.dsl.Value
+import io.computenode.cyfra.vulkan.memory.Buffer
 
 import java.nio.ByteBuffer
 
-trait RamGMem[T <: Value, R] extends GMem[T]:
-  protected val data: ByteBuffer
-  def toReadOnlyBuffer: ByteBuffer = data.asReadOnlyBuffer()
+/** A [[GMem]] of DSL values `T` that declares no members of its own.
+  *
+  * `R` is not referenced by the trait itself; implementations in this package use it for the host-side element type
+  * (for example `Float` for `Float32`).
+  */
+trait RamGMem[T <: Value, R] extends GMem[T]
diff --git a/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/mem/Vec4FloatMem.scala b/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/mem/Vec4FloatMem.scala
index eaa84e4c..572db49f 100644
--- a/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/mem/Vec4FloatMem.scala
+++ b/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/mem/Vec4FloatMem.scala
@@ -2,36 +2,126 @@ package io.computenode.cyfra.runtime.mem
 
 import io.computenode.cyfra.dsl.Value.{Float32, Vec4}
 import io.computenode.cyfra.runtime.mem.GMem.fRGBA
+import io.computenode.cyfra.vulkan.memory.Buffer
+import io.computenode.cyfra.runtime.GContext
+import org.lwjgl.vulkan.VK10.*
+import org.lwjgl.util.vma.Vma.*
 
-import org.lwjgl.system.MemoryUtil
 import java.nio.ByteBuffer
 
-class Vec4FloatMem(val size: Int, protected val data: ByteBuffer) extends RamGMem[Vec4[Float32], fRGBA]:
-  def toArray: Array[fRGBA] = {
-    val res = data.asFloatBuffer()
-    val result = new Array[fRGBA](size)
-    for (i <- 0 until size)
-      result(i) = (res.get(), res.get(), res.get(), res.get())
-    result
-  }
+/** `size` four-float vectors (`fRGBA` tuples on the host side) stored in the Vulkan buffer `vulkanBuffer`.
+  *
+  * The buffer is only released by an explicit call to [[cleanup]].
+  */
+class Vec4FloatMem(val size: Int, val vulkanBuffer: Buffer) extends RamGMem[Vec4[Float32], fRGBA]:
+
+  /** Reads the contents of [[vulkanBuffer]] back into a newly allocated array.
+    *
+    * The data travels through a temporary host-visible staging buffer that is destroyed before this method returns,
+    * whether or not the copy and the read succeed.
+    *
+    * @param context
+    *   provides the Vulkan allocator and command pool used for the transfer
+    * @return
+    *   an array of `size` tuples, each built from four consecutive floats of the buffer
+    * @throws ArithmeticException
+    *   if `size * Vec4FloatSize` bytes do not fit in an `Int`
+    */
+  def toArray(using context: GContext): Array[fRGBA] =
+    val allocator = context.vkContext.allocator
+    val commandPool = context.vkContext.commandPool
+    val bufferSize = size.toLong * Vec4FloatMem.Vec4FloatSize
+
+    val stagingBuffer = new Buffer(
+      Math.toIntExact(bufferSize), // reject oversized requests instead of silently truncating them
+      VK_BUFFER_USAGE_TRANSFER_DST_BIT,
+      VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+      VMA_MEMORY_USAGE_GPU_TO_CPU,
+      allocator
+    )
+
+    try
+      // Block until the copy has finished, then release the fence returned by copyBuffer.
+      Buffer.copyBuffer(vulkanBuffer, stagingBuffer, bufferSize, commandPool).block().destroy()
+
+      val result = new Array[fRGBA](size)
+      val byteBuffer = stagingBuffer.map()
+      try
+        val floatBuffer = byteBuffer.asFloatBuffer()
+        for (i <- 0 until size)
+          result(i) = (floatBuffer.get(), floatBuffer.get(), floatBuffer.get(), floatBuffer.get())
+      finally stagingBuffer.unmap()
+      result
+    finally stagingBuffer.destroy()
 
+  /** Destroys the underlying [[vulkanBuffer]]. */
+  def cleanup(): Unit =
+    vulkanBuffer.destroy()
 
 object Vec4FloatMem:
   val Vec4FloatSize = 16
 
-  def apply(vecs: Array[fRGBA]): Vec4FloatMem = {
-    val size = vecs.length
-    val data = ByteBuffer.allocateDirect(size * Vec4FloatSize)
-    vecs.foreach { case (x, y, z, a) =>
-      data.putFloat(x)
-      data.putFloat(y)
-      data.putFloat(z)
-      data.putFloat(a)
-    }
-    data.rewind()
-    new Vec4FloatMem(size, data)
-  }
+  /** Uploads `vecs` into a newly allocated device buffer and wraps it in a [[Vec4FloatMem]].
+    *
+    * The values are written to a temporary host-visible staging buffer and copied to the device buffer with a blocking
+    * transfer. The staging buffer is always destroyed; the device buffer is destroyed too if the upload fails.
+    *
+    * @param vecs
+    *   vectors to upload; each tuple is written as four consecutive floats
+    * @param context
+    *   provides the Vulkan allocator and command pool used for the transfer
+    * @throws ArithmeticException
+    *   if `vecs.length * Vec4FloatSize` bytes do not fit in an `Int`
+    */
+  def apply(vecs: Array[fRGBA])(using context: GContext): Vec4FloatMem =
+    val size = vecs.length
+    val bufferSize = size.toLong * Vec4FloatSize
+    val commandPool = context.vkContext.commandPool
+
+    val stagingBuffer = new Buffer(
+      Math.toIntExact(bufferSize),
+      VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
+      VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+      VMA_MEMORY_USAGE_CPU_ONLY,
+      context.vkContext.allocator
+    )
+
+    try
+      val byteBuffer = stagingBuffer.map()
+      try
+        val floatBuffer = byteBuffer.asFloatBuffer()
+        vecs.foreach { case (x, y, z, a) =>
+          floatBuffer.put(x)
+          floatBuffer.put(y)
+          floatBuffer.put(z)
+          floatBuffer.put(a)
+        }
+      finally stagingBuffer.unmap()
+
+      val deviceBuffer = newDeviceBuffer(bufferSize)
+      try
+        Buffer.copyBuffer(stagingBuffer, deviceBuffer, bufferSize, commandPool).block().destroy()
+        new Vec4FloatMem(size, deviceBuffer)
+      catch
+        case scala.util.control.NonFatal(e) =>
+          deviceBuffer.destroy() // do not leak the device allocation when the upload fails
+          throw e
+    finally stagingBuffer.destroy()
 
-  def apply(size: Int): Vec4FloatMem =
-    val data = ByteBuffer.allocateDirect(size * Vec4FloatSize)
-    new Vec4FloatMem(size, data)
+  /** Allocates a device buffer for `size` vectors, without writing any data to it, and wraps it in a [[Vec4FloatMem]].
+    *
+    * @throws ArithmeticException
+    *   if `size * Vec4FloatSize` bytes do not fit in an `Int`
+    */
+  def apply(size: Int)(using context: GContext): Vec4FloatMem =
+    new Vec4FloatMem(size, newDeviceBuffer(size.toLong * Vec4FloatSize))
+
+  /** Allocates a GPU-only storage buffer of `bufferSize` bytes usable as both source and target of transfers. */
+  private def newDeviceBuffer(bufferSize: Long)(using context: GContext): Buffer =
+    new Buffer(
+      Math.toIntExact(bufferSize),
+      VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
+      0,
+      VMA_MEMORY_USAGE_GPU_ONLY,
+      context.vkContext.allocator
+    )
diff --git a/cyfra-vulkan/src/main/scala/io/computenode/cyfra/vulkan/memory/Buffer.scala b/cyfra-vulkan/src/main/scala/io/computenode/cyfra/vulkan/memory/Buffer.scala
index 91c27ec1..2ac9968d 100644
--- a/cyfra-vulkan/src/main/scala/io/computenode/cyfra/vulkan/memory/Buffer.scala
+++ b/cyfra-vulkan/src/main/scala/io/computenode/cyfra/vulkan/memory/Buffer.scala
@@ -40,6 +40,23 @@ private[cyfra] class Buffer(val size: Int, val usage: Int, flags: Int, memUsage:
     check(vmaCreateBuffer(allocator.get, bufferInfo, allocInfo, pBuffer, pAllocation, null), "Failed to create buffer")
     (pBuffer.get(), pAllocation.get())
   }
+  
+  def map(): ByteBuffer = {
+    pushStack { stack =>
+      val pData = stack.callocPointer(1)
+      check(vmaMapMemory(allocator.get, allocation, pData), s"Failed to map buffer memory for buffer handle $handle allocation $allocation")
+      val dataPtr = pData.get(0)
+      if (dataPtr == NULL) {
+        throw new VulkanAssertionError(s"vmaMapMemory returned NULL for buffer handle $handle, allocation $allocation", -1)
+      }
+      memByteBuffer(dataPtr, this.size)
+    }
+  }
+
+  def unmap(): Unit = {
+    org.lwjgl.util.vma.Vma.vmaFlushAllocation(allocator.get, allocation, 0, VK_WHOLE_SIZE)
+    org.lwjgl.util.vma.Vma.vmaUnmapMemory(allocator.get, allocation) 
+  }
 
   def get(dst: Array[Byte]): Unit = {
     val len = Math.min(dst.length, size)

From 0581ef81612843f0240206105156f8956523130f Mon Sep 17 00:00:00 2001
From: rudrabeniwal <rudrabeniwal86@gmail.com>
Date: Wed, 4 Jun 2025 23:39:17 +0530
Subject: [PATCH 3/4] Refactor Vulkan buffer handling in GMem and Buffer
 classes. Update execute methods in GContext and GFunction to support uniform
 structures. Enhance SequenceExecutorTest to utilize new Buffer class for
 input and output operations.

---
 .../cyfra/vulkan/SequenceExecutorTest.scala   |  35 +++-
 .../animation/AnimatedFunctionRenderer.scala  |   5 +-
 .../cyfra/foton/rt/ImageRtRenderer.scala      |   5 +-
 .../rt/animation/AnimationRtRenderer.scala    |   5 +-
 .../computenode/cyfra/runtime/GContext.scala  | 167 ++++++++++++------
 .../computenode/cyfra/runtime/GFunction.scala |  27 +--
 .../computenode/cyfra/runtime/mem/GMem.scala  |  16 +-
 .../cyfra/vulkan/memory/Buffer.scala          |   5 +-
 8 files changed, 175 insertions(+), 90 deletions(-)

diff --git a/cyfra-e2e-test/src/test/scala/io/computenode/cyfra/vulkan/SequenceExecutorTest.scala b/cyfra-e2e-test/src/test/scala/io/computenode/cyfra/vulkan/SequenceExecutorTest.scala
index 31927a59..8657d927 100644
--- a/cyfra-e2e-test/src/test/scala/io/computenode/cyfra/vulkan/SequenceExecutorTest.scala
+++ b/cyfra-e2e-test/src/test/scala/io/computenode/cyfra/vulkan/SequenceExecutorTest.scala
@@ -1,12 +1,14 @@
 package io.computenode.cyfra.vulkan
 
-
 import io.computenode.cyfra.vulkan.compute.{Binding, ComputePipeline, InputBufferSize, LayoutInfo, LayoutSet, Shader}
 import io.computenode.cyfra.vulkan.executor.BufferAction.{LoadFrom, LoadTo}
 import io.computenode.cyfra.vulkan.executor.SequenceExecutor
 import io.computenode.cyfra.vulkan.executor.SequenceExecutor.{ComputationSequence, Compute, Dependency, LayoutLocation}
+import io.computenode.cyfra.vulkan.memory.Buffer
 import munit.FunSuite
 import org.lwjgl.BufferUtils
+import org.lwjgl.vulkan.VK10.* 
+import org.lwjgl.util.vma.Vma.* 
 
 class SequenceExecutorTest extends FunSuite:
   private val vulkanContext = VulkanContext(true)
@@ -24,10 +26,31 @@ class SequenceExecutorTest extends FunSuite:
     )
     val sequenceExecutor = new SequenceExecutor(sequence, vulkanContext)
     val input = 0 until 1024
-    val buffer = BufferUtils.createByteBuffer(input.length * 4)
-    input.foreach(buffer.putInt)
-    buffer.flip()
-    val res = sequenceExecutor.execute(Seq(buffer), input.length)
-    val output = input.map(_ => res.head.getInt)
+    
+    val inputBuffer = new Buffer(
+      input.length * 4, // 4 bytes per int
+      VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
+      VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+      VMA_MEMORY_USAGE_CPU_ONLY,
+      vulkanContext.allocator
+    )
+    
+    val mappedBuffer = inputBuffer.map()
+    input.foreach(mappedBuffer.putInt)
+    inputBuffer.unmap()
+    
+    val res = sequenceExecutor.execute(Seq(inputBuffer), input.length)
+    
+    val outputMappedBuffer = res.head.map()
+    val output = (0 until input.length).map(_ => outputMappedBuffer.getInt)
+    res.head.unmap()
 
     assertEquals(input.map(_ + 20000).toList, output.toList)
+    
+    // Clean up
+    inputBuffer.destroy()
+    res.foreach(_.destroy())
+    sequenceExecutor.destroy()
+    copy1.destroy()
+    copy2.destroy()
+    shader.destroy()
\ No newline at end of file
diff --git a/cyfra-foton/src/main/scala/io/computenode/cyfra/foton/animation/AnimatedFunctionRenderer.scala b/cyfra-foton/src/main/scala/io/computenode/cyfra/foton/animation/AnimatedFunctionRenderer.scala
index 514558ec..9c820b29 100644
--- a/cyfra-foton/src/main/scala/io/computenode/cyfra/foton/animation/AnimatedFunctionRenderer.scala
+++ b/cyfra-foton/src/main/scala/io/computenode/cyfra/foton/animation/AnimatedFunctionRenderer.scala
@@ -32,9 +32,10 @@ class AnimatedFunctionRenderer(params: AnimatedFunctionRenderer.Parameters) exte
   
   protected override def renderFrame(scene: AnimatedFunction, time: Float32, fn: RenderFn): Array[fRGBA] =
     val mem = Array.fill(params.width * params.height)((0.5f, 0.5f, 0.5f, 0.5f))
-    UniformContext.withUniform(AnimationIteration(time)):
+    val uniformStruct = AnimationIteration(time) 
+    UniformContext.withUniform(uniformStruct): 
       val fmem = Vec4FloatMem(mem)
-      fmem.map(fn).asInstanceOf[Vec4FloatMem].toArray
+      fmem.map(uniformStruct, fn).asInstanceOf[Vec4FloatMem].toArray
 
   protected override def renderFunction(scene: AnimatedFunction): RenderFn = 
     GFunction.from2D(params.width, {
diff --git a/cyfra-foton/src/main/scala/io/computenode/cyfra/foton/rt/ImageRtRenderer.scala b/cyfra-foton/src/main/scala/io/computenode/cyfra/foton/rt/ImageRtRenderer.scala
index 2057a54b..7ddd54a2 100644
--- a/cyfra-foton/src/main/scala/io/computenode/cyfra/foton/rt/ImageRtRenderer.scala
+++ b/cyfra-foton/src/main/scala/io/computenode/cyfra/foton/rt/ImageRtRenderer.scala
@@ -34,10 +34,11 @@ class ImageRtRenderer(params: ImageRtRenderer.Parameters) extends RtRenderer(par
   private def render(scene: Scene, fn: GFunction[RaytracingIteration, Vec4[Float32], Vec4[Float32]]): LazyList[Array[fRGBA]] =
     val initialMem = Array.fill(params.width * params.height)((0.5f, 0.5f, 0.5f, 0.5f))
     LazyList.iterate((initialMem, 0), params.iterations + 1) { case (mem, render) =>
-      UniformContext.withUniform(RaytracingIteration(render)):
+      val uniformStruct = RaytracingIteration(render) 
+      UniformContext.withUniform(uniformStruct): 
         val fmem = Vec4FloatMem(mem)
         val result = timed(s"Rendered iteration $render")(
-          fmem.map(fn).asInstanceOf[Vec4FloatMem].toArray
+          fmem.map(uniformStruct, fn).asInstanceOf[Vec4FloatMem].toArray
         )
         (result, render + 1)
     }.drop(1).map(_._1)
diff --git a/cyfra-foton/src/main/scala/io/computenode/cyfra/foton/rt/animation/AnimationRtRenderer.scala b/cyfra-foton/src/main/scala/io/computenode/cyfra/foton/rt/animation/AnimationRtRenderer.scala
index 12e94f4d..b2de521b 100644
--- a/cyfra-foton/src/main/scala/io/computenode/cyfra/foton/rt/animation/AnimationRtRenderer.scala
+++ b/cyfra-foton/src/main/scala/io/computenode/cyfra/foton/rt/animation/AnimationRtRenderer.scala
@@ -29,9 +29,10 @@ class AnimationRtRenderer(params: AnimationRtRenderer.Parameters) extends RtRend
   ): Array[fRGBA] =
     val initialMem = Array.fill(params.width * params.height)((0.5f, 0.5f, 0.5f, 0.5f))
     List.iterate((initialMem, 0), params.iterations + 1) { case (mem, render) =>
-      UniformContext.withUniform(RaytracingIteration(render, time)):
+      val uniformStruct = RaytracingIteration(render, time) 
+      UniformContext.withUniform(uniformStruct): 
         val fmem = Vec4FloatMem(mem)
-        val result = fmem.map(fn).asInstanceOf[Vec4FloatMem].toArray
+        val result = fmem.map(uniformStruct, fn).asInstanceOf[Vec4FloatMem].toArray
         (result, render + 1)
     }.map(_._1).last
 
diff --git a/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/GContext.scala b/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/GContext.scala
index 3a80afca..e93a03ea 100644
--- a/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/GContext.scala
+++ b/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/GContext.scala
@@ -11,85 +11,140 @@ import SequenceExecutor.*
 import io.computenode.cyfra.runtime.mem.GMem.totalStride
 import io.computenode.cyfra.spirv.SpirvTypes.typeStride
 import io.computenode.cyfra.spirv.compilers.DSLCompiler
-import io.computenode.cyfra.spirv.compilers.ExpressionCompiler.{UniformStructRef, WorkerIndex}
+import io.computenode.cyfra.spirv.compilers.ExpressionCompiler 
+import io.computenode.cyfra.dsl.Expression.E
 import mem.{FloatMem, GMem, Vec4FloatMem}
 import org.lwjgl.system.{Configuration, MemoryUtil}
 import izumi.reflect.Tag
-
-import java.io.FileOutputStream
+import io.computenode.cyfra.vulkan.memory.Buffer
+import org.lwjgl.vulkan.VK10.*
+import org.lwjgl.util.vma.Vma.*
 import java.nio.ByteBuffer
+import java.io.{FileOutputStream, IOException}
 import java.nio.channels.FileChannel
-import java.util.concurrent.Executors
+import scala.collection.mutable 
+import scala.collection.mutable.ListBuffer
 import scala.concurrent.{ExecutionContext, ExecutionContextExecutor}
 
 
-class GContext:
-
-  Configuration.STACK_SIZE.set(1024) // fix lwjgl stack size
+class GContext(debug: Boolean = false):
+  val vkContext = VulkanContext(debug)
+  private val pipelineCache = mutable.Map[Any, ComputePipeline]()
 
-  val vkContext = new VulkanContext(enableValidationLayers = true)
+  private def createPipeline[G <: GStruct[G] : GStructSchema, H <: Value : Tag : FromExpr, R <: Value : Tag : FromExpr](
+    function: GFunction[G, H, R]
+  ): ComputePipeline = {
+    val uniformStructSchemaImpl = summon[GStructSchema[G]]
+    val tagGImpl: Tag[G] = uniformStructSchemaImpl.structTag 
 
-  implicit val ec: ExecutionContextExecutor = ExecutionContext.fromExecutor(Executors.newFixedThreadPool(16))
-
-  def compile[
-    G <: GStruct[G] : Tag : GStructSchema,
-    H <: Value : Tag : FromExpr,
-    R <: Value : Tag : FromExpr
-  ](function: GFunction[G, H, R]): ComputePipeline = {
-    val uniformStructSchema = summon[GStructSchema[G]]
-    val uniformStruct = uniformStructSchema.fromTree(UniformStructRef)
+    val uniformStruct = uniformStructSchemaImpl.fromTree(
+      ExpressionCompiler.UniformStructRef[G](using tagGImpl).asInstanceOf[E[G]]
+    )
     val tree = function
-      .fn
+      .fn 
       .apply(
         uniformStruct,
-        WorkerIndex,
+        ExpressionCompiler.WorkerIndex, 
         GArray[H](0)
       )
-    val shaderCode = DSLCompiler.compile(tree, function.arrayInputs, function.arrayOutputs, uniformStructSchema)
+    val shaderCode = DSLCompiler.compile(tree, function.arrayInputs, function.arrayOutputs, uniformStructSchemaImpl)
     dumpSpvToFile(shaderCode, "program.spv") // TODO remove before release
-    val inOut = 0 to 1 map (Binding(_, InputBufferSize(typeStride(summon[Tag[H]]))))
-    val uniform = Option.when(uniformStructSchema.fields.nonEmpty)(Binding(2, UniformSize(totalStride(uniformStructSchema))))
-    val layoutInfo = LayoutInfo(Seq(LayoutSet(0, inOut ++ uniform)))
+
+    val inputBinding = Binding(0, InputBufferSize(typeStride(summon[Tag[H]])))
+    val outputBinding = Binding(1, InputBufferSize(typeStride(summon[Tag[R]])))
+    
+    val uniformBindingOpt = Option.when(uniformStructSchemaImpl.fields.nonEmpty)(
+      Binding(2, UniformSize(GMem.totalStride(uniformStructSchemaImpl)))
+    )
+    
+    val bindings = Seq(inputBinding, outputBinding) ++ uniformBindingOpt.toSeq
+    val layoutInfo = LayoutInfo(Seq(LayoutSet(0, bindings)))
+    
     val shader = new Shader(shaderCode, new org.joml.Vector3i(256, 1, 1), layoutInfo, "main", vkContext.device)
     new ComputePipeline(shader, vkContext)
   }
 
   private def dumpSpvToFile(code: ByteBuffer, path: String): Unit =
-    val fc: FileChannel = new FileOutputStream("program.spv").getChannel
-    fc.write(code)
-    fc.close()
-    code.rewind()
+    try {
+      val fc: FileChannel = new FileOutputStream(path).getChannel
+      fc.write(code)
+      fc.close()
+    } catch {
+      case e: IOException => e.printStackTrace()
+    } finally {
+      code.rewind()
+    }
 
   def execute[
     G <: GStruct[G] : Tag : GStructSchema,
-    H <: Value,
-    R <: Value
-  ](mem: GMem[H], fn: GFunction[?, H, R])(using uniformContext: UniformContext[_]): GMem[R] =
-    val isUniformEmpty = uniformContext.uniform.schema.fields.isEmpty
-    val actions = Map(
-      LayoutLocation(0, 0) -> BufferAction.LoadTo,
-      LayoutLocation(0, 1) -> BufferAction.LoadFrom
-    ) ++ (
-      if isUniformEmpty then Map.empty 
-      else Map(LayoutLocation(0, 2) -> BufferAction.LoadTo)
-    )
-    val sequence = ComputationSequence(Seq(Compute(fn.pipeline, actions)), Seq.empty)
-    val executor = new SequenceExecutor(sequence, vkContext)
+    H <: Value : Tag : FromExpr, 
+    R <: Value : FromExpr : Tag 
+  ](mem: GMem[H], uniformStruct: G, fn: GFunction[G, H, R]): GMem[R] = {
+    val pipeline = pipelineCache.getOrElseUpdate(fn.fn, createPipeline(fn))
+
+    val sourceBuffersForExecutor = ListBuffer[Buffer]()
+    val bufferActions = mutable.Map[LayoutLocation, BufferAction]()
+
+    bufferActions.put(LayoutLocation(0, 0), BufferAction.LoadTo)
+    sourceBuffersForExecutor.addOne(mem.vulkanBuffer)
+
+    bufferActions.put(LayoutLocation(0, 1), BufferAction.LoadFrom) 
+
+    var uniformStagingBufferOpt: Option[Buffer] = None
+    val uniformStructSchema = summon[GStructSchema[G]]
+    if (uniformStructSchema.fields.nonEmpty) {
+      val uniformCPUByteBuffer = GMem.serializeUniform(uniformStruct)
+      val uniformStagingVkBuffer = new Buffer(
+        uniformCPUByteBuffer.remaining(), // staging buffer size in bytes; ByteBuffer.remaining() already returns an Int
+        VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
+        VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+        VMA_MEMORY_USAGE_CPU_ONLY,
+        vkContext.allocator
+      )
+      val mappedUniform = uniformStagingVkBuffer.map()
+      mappedUniform.put(uniformCPUByteBuffer)
+      uniformStagingVkBuffer.unmap()
+      
+      uniformStagingBufferOpt = Some(uniformStagingVkBuffer)
+      bufferActions.put(LayoutLocation(0, 2), BufferAction.LoadTo)
+      sourceBuffersForExecutor.addOne(uniformStagingVkBuffer)
+    }
+
+    val computeStep = Compute(pipeline, bufferActions.toMap)
+    val sequence = ComputationSequence(Seq(computeStep), dependencies = Nil) 
+    val sequenceExecutor = new SequenceExecutor(sequence, vkContext) 
+
+    val outputVulkanBuffers = sequenceExecutor.execute(sourceBuffersForExecutor.toSeq, mem.size)
     
-    val data = mem.toReadOnlyBuffer
-    val inData =
-      if isUniformEmpty then Seq(data)
-      else Seq(data, GMem.serializeUniform(uniformContext.uniform))
-    val out = executor.execute(inData, mem.size)
-    executor.destroy()
-
-    val outTags = fn.arrayOutputs
-    assert(outTags.size == 1)
-
-    outTags.head match
-      case t if t == Tag[Float32] =>
-        new FloatMem(mem.size, out.head).asInstanceOf[GMem[R]]
-      case t if t == Tag[Vec4[Float32]] =>
-        new Vec4FloatMem(mem.size, out.head).asInstanceOf[GMem[R]]
-      case _ => assert(false, "Supported output types are Float32 and Vec4[Float32]")
+    uniformStagingBufferOpt.foreach(_.destroy())
+
+    if (outputVulkanBuffers.isEmpty) {
+      throw new IllegalStateException("SequenceExecutor did not return an output buffer.")
+    }
+    val resultVulkanBuffer = outputVulkanBuffers.head
+
+    val tagR = summon[Tag[R]]
+    val resultMem = 
+      if (tagR.tag =:= Tag[Float32].tag) { 
+        new FloatMem(mem.size, resultVulkanBuffer).asInstanceOf[GMem[R]]
+      } else if (tagR.tag =:= Tag[Vec4[Float32]].tag) { 
+        new Vec4FloatMem(mem.size, resultVulkanBuffer).asInstanceOf[GMem[R]]
+      } else {
+        resultVulkanBuffer.destroy()
+        throw new UnsupportedOperationException(s"Cannot create GMem for result type ${tagR.tag}. Output buffer has been destroyed.")
+      }
+    resultMem
+  }
+
+  def execute[H <: Value : Tag : FromExpr, R <: Value : FromExpr : Tag]( 
+    mem: GMem[H],
+    fn: GFunction[GStruct.Empty, H, R]
+  ): GMem[R] =
+    execute[GStruct.Empty, H, R](mem, GStruct.Empty(), fn) 
+
+  def cleanup(): Unit = {
+    pipelineCache.values.foreach(_.destroy()) 
+    pipelineCache.clear()
+    vkContext.destroy() 
+  }
 
diff --git a/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/GFunction.scala b/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/GFunction.scala
index 48c2d5b5..59cca842 100644
--- a/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/GFunction.scala
+++ b/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/GFunction.scala
@@ -1,38 +1,39 @@
 package io.computenode.cyfra.runtime
 
-import io.computenode.cyfra.dsl.{*, given}
+import io.computenode.cyfra.dsl.{*, given} 
 import io.computenode.cyfra.dsl.Value.Int32
-import io.computenode.cyfra.vulkan.compute.ComputePipeline
+import io.computenode.cyfra.dsl.Expression.E 
 import izumi.reflect.Tag
 
 case class GFunction[
-  G <: GStruct[G] : GStructSchema : Tag, 
-  H <: Value : Tag : FromExpr, 
+  G <: GStruct[G] : GStructSchema : Tag,
+  H <: Value : Tag : FromExpr,
   R <: Value : Tag : FromExpr
-](fn: (G, Int32, GArray[H]) => R)(implicit context: GContext){
+](
+  val fn: (G, Int32, GArray[H]) => R 
+) {
   def arrayInputs: List[Tag[_]] = List(summon[Tag[H]])
   def arrayOutputs: List[Tag[_]] = List(summon[Tag[R]])
-  val pipeline: ComputePipeline = context.compile(this)
 }
 
 object GFunction:
   def apply[
     H <: Value : Tag : FromExpr,
     R <: Value : Tag : FromExpr
-  ](fn: H => R)(using context: GContext): GFunction[GStruct.Empty, H, R] =
+  ](userSimpleFn: H => R): GFunction[GStruct.Empty, H, R] =
     new GFunction[GStruct.Empty, H, R](
-      (_, index: Int32, gArray: GArray[H]) => fn(gArray.at(index))
+      (_: GStruct.Empty, workerIdx: Int32, gArray: GArray[H]) => userSimpleFn(gArray.at(workerIdx))
     )
 
   def from2D[
     G <: GStruct[G] : GStructSchema : Tag,
     H <: Value : Tag : FromExpr,
     R <: Value : Tag : FromExpr
-  ](width: Int, fn: (G, (Int32, Int32), GArray2D[H]) => R)(using context: GContext): GFunction[G, H, R] =
-    GFunction[G, H, R](
-      (g: G, index: Int32, a: GArray[H]) =>
+  ](width: Int, userFn2D: (G, (Int32, Int32), GArray2D[H]) => R): GFunction[G, H, R] =
+    new GFunction[G, H, R](
+      (g: G, index: Int32, garray: GArray[H]) =>
         val x: Int32 = index mod width
         val y: Int32 = index / width
-        val arr = GArray2D(width, a)
-        fn(g, (x, y), arr)
+        val arr2d = GArray2D(width, garray)
+        userFn2D(g, (x, y), arr2d)
     )
diff --git a/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/mem/GMem.scala b/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/mem/GMem.scala
index 01cd8d0f..1f2c750c 100644
--- a/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/mem/GMem.scala
+++ b/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/mem/GMem.scala
@@ -9,18 +9,22 @@ import io.computenode.cyfra.spirv.SpirvTypes.typeStride
 import io.computenode.cyfra.runtime.{GFunction, GContext}
 
 import izumi.reflect.Tag
-import org.lwjgl.system.MemoryUtil
-
 import java.nio.ByteBuffer
-import io.computenode.cyfra.vulkan.memory.Buffer
-trait GMem[H <: Value]:
+import io.computenode.cyfra.vulkan.memory.Buffer 
+
+trait GMem[H <: Value : Tag : FromExpr]: 
   def size: Int
   def vulkanBuffer: Buffer
+
   def map[
     G <: GStruct[G] : Tag : GStructSchema,
     R <: Value : FromExpr : Tag
-  ](fn: GFunction[G, H, R])(using context: GContext): GMem[R] =
-    context.execute(this, fn)
+  ](uniformStruct: G, fn: GFunction[G, H, R])(using context: GContext): GMem[R] =
+    context.execute(this, uniformStruct, fn)
+
+  def map[R <: Value : FromExpr : Tag]
+    (fn: GFunction[GStruct.Empty, H, R])(using context: GContext): GMem[R] =
+    context.execute(this, fn) 
 
   def cleanup(): Unit
 end GMem
diff --git a/cyfra-vulkan/src/main/scala/io/computenode/cyfra/vulkan/memory/Buffer.scala b/cyfra-vulkan/src/main/scala/io/computenode/cyfra/vulkan/memory/Buffer.scala
index 2ac9968d..a5426c6f 100644
--- a/cyfra-vulkan/src/main/scala/io/computenode/cyfra/vulkan/memory/Buffer.scala
+++ b/cyfra-vulkan/src/main/scala/io/computenode/cyfra/vulkan/memory/Buffer.scala
@@ -40,7 +40,7 @@ private[cyfra] class Buffer(val size: Int, val usage: Int, flags: Int, memUsage:
     check(vmaCreateBuffer(allocator.get, bufferInfo, allocInfo, pBuffer, pAllocation, null), "Failed to create buffer")
     (pBuffer.get(), pAllocation.get())
   }
-  
+
   def map(): ByteBuffer = {
     pushStack { stack =>
       val pData = stack.callocPointer(1)
@@ -54,8 +54,7 @@ private[cyfra] class Buffer(val size: Int, val usage: Int, flags: Int, memUsage:
   }
 
   def unmap(): Unit = {
-    org.lwjgl.util.vma.Vma.vmaFlushAllocation(allocator.get, allocation, 0, VK_WHOLE_SIZE)
-    org.lwjgl.util.vma.Vma.vmaUnmapMemory(allocator.get, allocation) 
+    vmaUnmapMemory(allocator.get, allocation)
   }
 
   def get(dst: Array[Byte]): Unit = {

From 959e7a6a5125121216f9976bacbb0c7f5a3e2550 Mon Sep 17 00:00:00 2001
From: rudrabeniwal <rudrabeniwal86@gmail.com>
Date: Sat, 7 Jun 2025 03:25:08 +0530
Subject: [PATCH 4/4] Make map/unmap functional (review feedback)

---
 .../computenode/cyfra/runtime/GContext.scala  |  6 +-
 .../cyfra/runtime/mem/FloatMem.scala          | 18 +++---
 .../cyfra/runtime/mem/Vec4FloatMem.scala      | 30 ++++-----
 .../cyfra/vulkan/memory/Buffer.scala          | 61 ++++++++++---------
 4 files changed, 60 insertions(+), 55 deletions(-)

diff --git a/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/GContext.scala b/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/GContext.scala
index e93a03ea..b9491ba2 100644
--- a/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/GContext.scala
+++ b/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/GContext.scala
@@ -101,9 +101,9 @@ class GContext(debug: Boolean = false):
         VMA_MEMORY_USAGE_CPU_ONLY,
         vkContext.allocator
       )
-      val mappedUniform = uniformStagingVkBuffer.map()
-      mappedUniform.put(uniformCPUByteBuffer)
-      uniformStagingVkBuffer.unmap()
+      uniformStagingVkBuffer.map { mappedUniform =>
+        mappedUniform.put(uniformCPUByteBuffer)
+      }
       
       uniformStagingBufferOpt = Some(uniformStagingVkBuffer)
       bufferActions.put(LayoutLocation(0, 2), BufferAction.LoadTo)
diff --git a/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/mem/FloatMem.scala b/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/mem/FloatMem.scala
index ade00101..36414925 100644
--- a/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/mem/FloatMem.scala
+++ b/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/mem/FloatMem.scala
@@ -24,11 +24,13 @@ class FloatMem(val size: Int, val vulkanBuffer: Buffer) extends RamGMem[Float32,
 
     Buffer.copyBuffer(vulkanBuffer, stagingBuffer, bufferSize, commandPool).block().close()
     
-    val byteBuffer = stagingBuffer.map()
-    val floatBuffer = byteBuffer.asFloatBuffer()
-    val result = new Array[Float](size)
-    floatBuffer.get(result)
-    stagingBuffer.unmap()
+    val result = stagingBuffer.map { byteBuffer =>
+      val floatBuffer = byteBuffer.asFloatBuffer()
+      val arr = new Array[Float](size)
+      floatBuffer.get(arr)
+      arr
+    }
+
     stagingBuffer.destroy()
     result
 
@@ -52,9 +54,9 @@ object FloatMem {
       allocator
     )
 
-    val byteBuffer = stagingBuffer.map()
-    byteBuffer.asFloatBuffer().put(floats)
-    stagingBuffer.unmap()
+    stagingBuffer.map { byteBuffer =>
+      byteBuffer.asFloatBuffer().put(floats)
+    }
 
     val deviceBuffer = new Buffer(
       bufferSize.toInt,
diff --git a/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/mem/Vec4FloatMem.scala b/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/mem/Vec4FloatMem.scala
index 572db49f..5c93c70e 100644
--- a/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/mem/Vec4FloatMem.scala
+++ b/cyfra-runtime/src/main/scala/io/computenode/cyfra/runtime/mem/Vec4FloatMem.scala
@@ -25,13 +25,13 @@ class Vec4FloatMem(val size: Int, val vulkanBuffer: Buffer) extends RamGMem[Vec4
 
     Buffer.copyBuffer(vulkanBuffer, stagingBuffer, bufferSize, commandPool).block().close()
 
-    val byteBuffer = stagingBuffer.map()
-    val floatBuffer = byteBuffer.asFloatBuffer()
-    val result = new Array[fRGBA](size)
-    for (i <- 0 until size)
-      result(i) = (floatBuffer.get(), floatBuffer.get(), floatBuffer.get(), floatBuffer.get())
-    
-    stagingBuffer.unmap()
+    val result = stagingBuffer.map { byteBuffer =>
+      val floatBuffer = byteBuffer.asFloatBuffer()
+      val arr = new Array[fRGBA](size)
+      for (i <- 0 until size)
+        arr(i) = (floatBuffer.get(), floatBuffer.get(), floatBuffer.get(), floatBuffer.get())
+      arr
+    }
     stagingBuffer.destroy()
     result
   }
@@ -56,15 +56,15 @@ object Vec4FloatMem:
       allocator
     )
 
-    val byteBuffer = stagingBuffer.map()
-    val floatBuffer = byteBuffer.asFloatBuffer()
-    vecs.foreach { case (x, y, z, a) =>
-      floatBuffer.put(x)
-      floatBuffer.put(y)
-      floatBuffer.put(z)
-      floatBuffer.put(a)
+    stagingBuffer.map { byteBuffer =>
+      val floatBuffer = byteBuffer.asFloatBuffer()
+      vecs.foreach { case (x, y, z, a) =>
+        floatBuffer.put(x)
+        floatBuffer.put(y)
+        floatBuffer.put(z)
+        floatBuffer.put(a)
+      }
     }
-    stagingBuffer.unmap()
 
     val deviceBuffer = new Buffer(
       bufferSize.toInt,
diff --git a/cyfra-vulkan/src/main/scala/io/computenode/cyfra/vulkan/memory/Buffer.scala b/cyfra-vulkan/src/main/scala/io/computenode/cyfra/vulkan/memory/Buffer.scala
index a5426c6f..d7f7ea41 100644
--- a/cyfra-vulkan/src/main/scala/io/computenode/cyfra/vulkan/memory/Buffer.scala
+++ b/cyfra-vulkan/src/main/scala/io/computenode/cyfra/vulkan/memory/Buffer.scala
@@ -41,28 +41,34 @@ private[cyfra] class Buffer(val size: Int, val usage: Int, flags: Int, memUsage:
     (pBuffer.get(), pAllocation.get())
   }
 
-  def map(): ByteBuffer = {
-    pushStack { stack =>
-      val pData = stack.callocPointer(1)
-      check(vmaMapMemory(allocator.get, allocation, pData), s"Failed to map buffer memory for buffer handle $handle allocation $allocation")
-      val dataPtr = pData.get(0)
-      if (dataPtr == NULL) {
-        throw new VulkanAssertionError(s"vmaMapMemory returned NULL for buffer handle $handle, allocation $allocation", -1)
+  def map[R](f: ByteBuffer => R): R = {
+    var dataPtr: Long = NULL
+    try {
+      dataPtr = pushStack { stack =>
+        val pData = stack.callocPointer(1)
+        check(vmaMapMemory(allocator.get, allocation, pData), s"Failed to map buffer memory for buffer handle $handle allocation $allocation")
+        val ptr = pData.get(0)
+        if (ptr == NULL) {
+          throw new VulkanAssertionError(s"vmaMapMemory returned NULL for buffer handle $handle, allocation $allocation", -1)
+        }
+        ptr
+      }
+      val byteBuffer = memByteBuffer(dataPtr, this.size)
+      f(byteBuffer)
+    } finally {
+      if (dataPtr != NULL) {
+        vmaUnmapMemory(allocator.get, allocation)
       }
-      memByteBuffer(dataPtr, this.size)
     }
   }
 
-  def unmap(): Unit = {
-    vmaUnmapMemory(allocator.get, allocation)
-  }
-
   def get(dst: Array[Byte]): Unit = {
     val len = Math.min(dst.length, size)
-    val byteBuffer = memCalloc(len)
-    Buffer.copyBuffer(this, byteBuffer, len)
-    byteBuffer.get(dst)
-    memFree(byteBuffer)
+    this.map { mappedBuffer =>
+      val bufferSlice = mappedBuffer.slice() 
+      bufferSlice.limit(len)
+      bufferSlice.get(dst, 0, len) 
+    }
   }
 
   protected def close(): Unit =
@@ -70,23 +76,20 @@ private[cyfra] class Buffer(val size: Int, val usage: Int, flags: Int, memUsage:
 }
 
 object Buffer {
-  def copyBuffer(src: ByteBuffer, dst: Buffer, bytes: Long): Unit =
-    pushStack { stack =>
-      val pData = stack.callocPointer(1)
-      check(vmaMapMemory(dst.allocator.get, dst.allocation, pData), "Failed to map destination buffer memory")
-      val data = pData.get()
-      memCopy(memAddress(src), data, bytes)
+  def copyBuffer(src: ByteBuffer, dst: Buffer, bytes: Long): Unit = {
+    dst.map { dstMappedBuffer =>
+      val srcSlice = src.slice()
+      srcSlice.limit(bytes.toInt) 
+      dstMappedBuffer.put(srcSlice)
       vmaFlushAllocation(dst.allocator.get, dst.allocation, 0, bytes)
-      vmaUnmapMemory(dst.allocator.get, dst.allocation)
     }
+  }
 
   def copyBuffer(src: Buffer, dst: ByteBuffer, bytes: Long): Unit =
-    pushStack { stack =>
-      val pData = stack.callocPointer(1)
-      check(vmaMapMemory(src.allocator.get, src.allocation, pData), "Failed to map destination buffer memory")
-      val data = pData.get()
-      memCopy(data, memAddress(dst), bytes)
-      vmaUnmapMemory(src.allocator.get, src.allocation)
+    src.map { srcMappedBuffer =>
+      val srcSlice = srcMappedBuffer.slice()
+      srcSlice.limit(bytes.toInt)
+      dst.put(srcSlice)
     }
 
   def copyBuffer(src: Buffer, dst: Buffer, bytes: Long, commandPool: CommandPool): Fence =