Skip to content

Commit 4e5bcb8

Browse files
committed
Implement double buffering
1 parent 6c2ed1b commit 4e5bcb8

File tree

6 files changed

+106
-32
lines changed

6 files changed

+106
-32
lines changed

tensorforge/backend/instructions/builders/ptr_manip_builder.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ class GetElementPtrBuilder(AbstractBuilder):
1212
def __init__(self, context: Context, scopes: Scopes):
1313
super(GetElementPtrBuilder, self).__init__(context, scopes)
1414

15-
def build(self, src: Symbol, include_extra_offset: bool = True):
15+
def build(self, src: Symbol, include_extra_offset: bool = True, batch_offset = 0):
1616
self._reset()
1717

1818
dstype = src.stype
@@ -32,6 +32,6 @@ def build(self, src: Symbol, include_extra_offset: bool = True):
3232
self._scopes.add_symbol(dest)
3333

3434
if src.stype != SymbolType.Data:
35-
self._instructions.append(GetElementPtr(self._context, src, dest, include_extra_offset))
35+
self._instructions.append(GetElementPtr(self._context, src, dest, include_extra_offset, batch_offset))
3636

3737
src.add_user(self)

tensorforge/backend/instructions/memory/load.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -332,7 +332,7 @@ def gen_code_inner(self, writer: Writer) -> None:
332332

333333
start = (total_size // granularity) * granularity
334334

335-
elif self._context.get_vm().get_hw_descr().vendor in ['amd']:
335+
elif self._context.get_vm().get_hw_descr().vendor in ['amd'] and False:
336336

337337
# float4 load
338338

@@ -354,10 +354,11 @@ def gen_code_inner(self, writer: Writer) -> None:
354354
for g in [4, 2, 1]: # [4, 3, 2, 1]
355355
# 4x4
356356
# writer(f'const auto f{g}idx = (threadIdx.x % {g}) * {self._num_threads} + (threadIdx.x / {g}) * {g};')
357+
total_count_g = (total_count // g) * g
357358

358-
writer(f'const auto f{g}idx = ((threadIdx.x / {16 // g}) % {g}) * {self._num_threads} + (threadIdx.x % {16 // g}) * {g} + (threadIdx.x / 16) * 16;')
359+
if start != total_count_g:
360+
writer(f'const auto f{g}idx = ((threadIdx.x / {16 // g}) % {g}) * {self._num_threads} + (threadIdx.x % {16 // g}) * {g} + (threadIdx.x / 16) * 16;')
359361

360-
total_count_g = (total_count // g) * g
361362
for i in range(start, total_count_g, g):
362363
sidx = i // lead_count
363364
ridx = i % lead_count

tensorforge/backend/instructions/ptr_manip.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,22 @@
11
from .abstract_instruction import AbstractInstruction
2-
from tensorforge.common.vm.vm import VM
2+
from tensorforge.common.context import Context
33
from tensorforge.common.helper import get_extra_offset_name, Addressing
44
from tensorforge.common.basic_types import GeneralLexicon, DataFlowDirection, StridedAddressing
55
from tensorforge.common.exceptions import GenerationError
66

77
class GetElementPtr(AbstractInstruction):
88
def __init__(self,
9-
vm: VM,
9+
context: Context,
1010
src,
1111
dest,
12-
include_extra_offset=True):
13-
super(GetElementPtr, self).__init__(vm)
12+
include_extra_offset=True,
13+
batch_offset=0):
14+
super(GetElementPtr, self).__init__(context)
1415
self._src = src
1516
self._dest = dest
1617
self._include_extra_offset = include_extra_offset
1718
self._is_ready = True
19+
self._batch_offset = batch_offset
1820

1921
def gen_code(self, writer):
2022

@@ -30,21 +32,21 @@ def gen_code(self, writer):
3032

3133
address = ''
3234
if isinstance(batch_addressing, StridedAddressing):
33-
main_offset = f'{GeneralLexicon.BATCH_ID_NAME} * {batch_addressing.stride}'
35+
main_offset = f'{GeneralLexicon.BATCH_ID_NAME}{self._batch_offset} * {batch_addressing.stride}'
3436
sub_offset = f'{batch_obj.get_offset_to_first_element()}'
3537
address = f'{main_offset} + {batch_addressing.offset} + {sub_offset}{extra_offset}'
3638
rhs = f'&{self._src.name}[{address}]'
3739
lhs = 'const ' if self._src.obj.direction == DataFlowDirection.SOURCE else ''
3840
lhs += f'{datatype} * const {self._vm.get_lexic().restrict_kw} {self._dest.name}'
3941
if batch_addressing == Addressing.STRIDED:
40-
main_offset = f'{GeneralLexicon.BATCH_ID_NAME} * {batch_obj.get_real_volume()}'
42+
main_offset = f'{GeneralLexicon.BATCH_ID_NAME}{self._batch_offset} * {batch_obj.get_real_volume()}'
4143
sub_offset = f'{batch_obj.get_offset_to_first_element()}'
4244
address = f'{main_offset} + {sub_offset}{extra_offset}'
4345
rhs = f'&{self._src.name}[{address}]'
4446
lhs = 'const ' if self._src.obj.direction == DataFlowDirection.SOURCE else ''
4547
lhs += f'{datatype} * const {self._vm.get_lexic().restrict_kw} {self._dest.name}'
4648
elif batch_addressing == Addressing.PTR_BASED:
47-
main_offset = f'{GeneralLexicon.BATCH_ID_NAME}'
49+
main_offset = f'{GeneralLexicon.BATCH_ID_NAME}{self._batch_offset}'
4850
sub_offset = f'{batch_obj.get_offset_to_first_element()}'
4951
address = f'{main_offset}][{sub_offset}{extra_offset}'
5052
src_suffix = '_ptr' if self._vm.get_lexic()._backend == 'targetdart' else ''
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
from typing import List
2+
from .abstract import AbstractTransformer, Context, AbstractInstruction
3+
from tensorforge.backend.instructions.compute import ComputeInstruction
4+
from tensorforge.backend.instructions.memory import AbstractShrMemWrite, MemoryInstruction
5+
from tensorforge.backend.instructions.memory.load import LoadInstruction, LoadWait, GlbToRegLoader
6+
from tensorforge.backend.instructions.memory.store import StoreRegToReg
7+
from tensorforge.backend.instructions.ptr_manip import GetElementPtr
8+
from tensorforge.backend.instructions.allocate import RegisterAlloc
9+
from tensorforge.backend.symbol import SymbolType, Symbol
10+
from copy import deepcopy
11+
12+
class MultiBuffer(AbstractTransformer):
13+
def __init__(self,
14+
context: Context,
15+
instructions: List[AbstractInstruction]):
16+
super(MultiBuffer, self).__init__(context, instructions)
17+
self._global_instrs = []
18+
19+
def apply(self) -> None:
20+
globalinstrs = []
21+
newinstrs = []
22+
23+
epmap = {}
24+
25+
for i, instr in enumerate(self._instrs):
26+
if isinstance(instr, LoadInstruction) and not isinstance(instr, LoadWait):
27+
newregs = deepcopy(instr._dest.obj)
28+
newregs.name = f'preload_{newregs.name}'
29+
newregsym = Symbol(newregs.name, SymbolType.Register, newregs)
30+
newregsym.data_view = instr._dest.data_view
31+
newregsym.num_threads = instr._dest.num_threads
32+
newregsym.datatype = instr._dest.datatype
33+
newsym = Symbol(f'next_{instr._src.name}', instr._src.stype, instr._src.obj)
34+
newsym.data_view = instr._src.data_view
35+
newsym.num_threads = instr._src.num_threads
36+
newsym.datatype = instr._src.datatype
37+
newload1 = GlbToRegLoader(self._context, newsym, newregsym, instr._num_threads, instr._linearize)
38+
newload2 = GlbToRegLoader(self._context, newsym, newregsym, instr._num_threads, instr._linearize)
39+
globalinstrs += [GetElementPtr(self._context, epmap[instr._src.name], newsym, batch_offset=1)]
40+
globalinstrs += [RegisterAlloc(self._context, newregsym, 0, 0.0)]
41+
globalinstrs += [newload1]
42+
newinstrs += [GetElementPtr(self._context, epmap[instr._src.name], newsym, batch_offset=1)]
43+
newinstrs += [LoadWait(newload1)]
44+
newinstrs += [StoreRegToReg(self._context, newregsym, instr._dest, instr._num_threads)]
45+
newinstrs += [newload2]
46+
elif isinstance(instr, GetElementPtr) or isinstance(instr, RegisterAlloc):
47+
newinstrs += [instr]
48+
49+
# hack
50+
if isinstance(instr, GetElementPtr):
51+
epmap[instr._dest.name] = instr._src
52+
else:
53+
self._global_instrs += globalinstrs
54+
self._instrs = newinstrs + self._instrs[i:]
55+
break

tensorforge/backend/opt/optimizer.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from .sync_block import SyncThreadsOpt
1010
from .remove_redundancy import RemoveRedundancyOpt
1111
from .memmove import MoveLoads
12+
from .multibuffer import MultiBuffer
1213

1314
class OptimizationStage:
1415
def __init__(self,
@@ -19,6 +20,7 @@ def __init__(self,
1920
self._context = context
2021
self._shr_mem: ShrMemObject = shr_mem
2122
self._instrs: List[AbstractInstruction] = instructions
23+
self._global_instrs: List[AbstractInstruction] = []
2224
self._num_instrs: int = len(instructions)
2325
self._user_options = context.get_user_options()
2426
self._num_threads = num_threads
@@ -28,6 +30,11 @@ def optimize(self):
2830
opt.apply()
2931
self._instrs = opt.get_instructions()
3032

33+
opt = MultiBuffer(self._context, self._instrs)
34+
opt.apply()
35+
self._instrs = opt.get_instructions()
36+
self._global_instrs = opt._global_instrs
37+
3138
opt = LivenessAnalysis(self._context, self._instrs)
3239
opt.apply()
3340
live_map: Dict[int, Set[Symbol]] = opt.get_live_map()
@@ -63,3 +70,6 @@ def optimize(self):
6370

6471
def get_instructions(self):
6572
return self._instrs
73+
74+
def get_global_instructions(self):
75+
return self._global_instrs

tensorforge/generators/generator.py

Lines changed: 26 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,7 @@ def generate(self):
142142
num_threads=self._num_threads)
143143
opt.optimize()
144144
self._section.ir = opt.get_instructions()
145+
self._section.global_ir += opt.get_global_instructions()
145146

146147
# add final sync for persistent threads
147148
if self._persistent_threading or self._clusterlaunchcontrol:
@@ -173,6 +174,24 @@ def _generate_kernel(self):
173174

174175
for i,section in enumerate(self._sections):
175176
with writer.AnonymousScope():
177+
178+
offset = []
179+
idx = i - 1
180+
for ssection in reversed(self._sections[:i]):
181+
if ssection.barrier:
182+
break
183+
offset += [f'{GeneralLexicon.NUM_ELEMENTS}{idx}']
184+
idx -= 1
185+
186+
stride = f'({vm.get_lexic().grid_dim_x} * {vm.get_lexic().block_dim_y})'
187+
if len(offset) == 0:
188+
start = self._get_2d_block_id()
189+
else:
190+
start = f'({self._get_2d_block_id()} + {" + ".join(offset)}) % {stride}'
191+
192+
writer(f'const auto {GeneralLexicon.BATCH_ID_NAME}_start = {start};')
193+
writer(f'const auto {GeneralLexicon.BATCH_ID_NAME}1 = {GeneralLexicon.BATCH_ID_NAME}_start < {GeneralLexicon.NUM_ELEMENTS}{i} ? {GeneralLexicon.BATCH_ID_NAME}_start : 0;')
194+
176195
for instruction in section.global_ir:
177196
if instruction.is_ready():
178197
instruction.gen_code(writer)
@@ -193,37 +212,24 @@ def generate_inner():
193212
# TODO: OMP target
194213
# TODO: maybe iterate over adjacent elements? (for indirect pointers)
195214

196-
offset = []
197-
idx = i - 1
198-
for ssection in reversed(self._sections[:i]):
199-
if ssection.barrier:
200-
break
201-
offset += [f'{GeneralLexicon.NUM_ELEMENTS}{idx}']
202-
idx -= 1
203-
204-
stride = f'({vm.get_lexic().grid_dim_x} * {vm.get_lexic().block_dim_y})'
205-
if len(offset) == 0:
206-
start = self._get_2d_block_id()
207-
else:
208-
start = f'({self._get_2d_block_id()} + {" + ".join(offset)}) % {stride}'
209-
210-
with writer.For(f'size_t {GeneralLexicon.BATCH_ID_NAME} = {start}; {GeneralLexicon.BATCH_ID_NAME} < {GeneralLexicon.NUM_ELEMENTS}{i}; {GeneralLexicon.BATCH_ID_NAME} += {stride}'):
215+
with writer.For(f'size_t {GeneralLexicon.BATCH_ID_NAME}0 = {start}; {GeneralLexicon.BATCH_ID_NAME}0 < {GeneralLexicon.NUM_ELEMENTS}{i}; {GeneralLexicon.BATCH_ID_NAME}0 += {stride}'):
216+
writer(f'const auto {GeneralLexicon.BATCH_ID_NAME}1 = {GeneralLexicon.BATCH_ID_NAME}0 + {stride} < {GeneralLexicon.NUM_ELEMENTS}{i} ? {GeneralLexicon.BATCH_ID_NAME}0 + {stride} : {GeneralLexicon.BATCH_ID_NAME}0;')
211217
generate_inner()
212218
elif self._clusterlaunchcontrol:
213219
writer(f'__shared__ tensorforge::ClusterLaunchCtrl launchctrl;')
214220
writer(f'int phase = 0;')
215221
writer(f'launchctrl.init();')
216-
writer(f'size_t {GeneralLexicon.BATCH_ID_NAME} = {self._get_2d_block_id()};')
222+
writer(f'size_t {GeneralLexicon.BATCH_ID_NAME}0 = {self._get_2d_block_id()};')
217223
with writer.While(f'true'):
218224
writer('launchctrl.setupNext();')
219225
with writer.If(f'{self._get_element_size_guard(i)}'):
220226
generate_inner()
221227
writer('const auto nextBlock = launchctrl.queryNext(phase);')
222228
with writer.If('!nextBlock.has_value()'):
223229
writer('break;')
224-
writer(f'{GeneralLexicon.BATCH_ID_NAME} = {self._get_2d_block_id("nextBlock.value()")};')
230+
writer(f'{GeneralLexicon.BATCH_ID_NAME}0 = {self._get_2d_block_id("nextBlock.value()")};')
225231
else:
226-
writer(f'const size_t {GeneralLexicon.BATCH_ID_NAME} = {self._get_2d_block_id()};')
232+
writer(f'const size_t {GeneralLexicon.BATCH_ID_NAME}0 = {self._get_2d_block_id()};')
227233
with writer.If(f'{self._get_element_size_guard(i)}'):
228234
generate_inner()
229235

@@ -612,10 +618,10 @@ def _get_2d_block_id(self, block=None):
612618
return f'{lexic.thread_idx_y} + {lexic.block_dim_y} * ({block})'
613619

614620
def _get_element_size_guard(self, i):
615-
return f'{GeneralLexicon.BATCH_ID_NAME} < {GeneralLexicon.NUM_ELEMENTS}{i}'
621+
return f'{GeneralLexicon.BATCH_ID_NAME}0 < {GeneralLexicon.NUM_ELEMENTS}{i}'
616622

617623
def _get_flag_guard(self, writer, i):
618624
writer(f'bool allowed = true;')
619625
with writer.If(f'{GeneralLexicon.FLAGS_NAME}{i} != nullptr'):
620-
writer(f'allowed = static_cast<bool>({GeneralLexicon.FLAGS_NAME}{i}[{GeneralLexicon.BATCH_ID_NAME}]);')
626+
writer(f'allowed = static_cast<bool>({GeneralLexicon.FLAGS_NAME}{i}[{GeneralLexicon.BATCH_ID_NAME}0]);')
621627
return 'allowed'

0 commit comments

Comments
 (0)