Skip to content

Commit 296a812

Browse files
committed
Continue work on buffering
1 parent 4e5bcb8 commit 296a812

File tree

7 files changed

+112
-31
lines changed

7 files changed

+112
-31
lines changed

tensorforge/backend/instructions/memory/load.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ def __init__(self, **kwargs):
2828
self._num_threads = kwargs['num_threads']
2929
self._permute: None = kwargs['permute']
3030
self._manual_unroll_threshold = 4
31+
self._no_memcpy = kwargs['no_memcpy'] if 'no_memcpy' in kwargs else False
3132

3233
if 'max_load_offset' in kwargs:
3334
self._max_load_offset = kwargs['max_load_offset']
@@ -54,7 +55,7 @@ def __init__(self, **kwargs):
5455
self._shr_mem.add_user(self)
5556
self._is_ready: bool = False
5657

57-
self._use_cuda_memcpy = self._context.get_vm().get_hw_descr().vendor == 'nvidia'
58+
self._use_cuda_memcpy = self._context.get_vm().get_hw_descr().vendor == 'nvidia' and not self._no_memcpy
5859

5960
if self._permute is None:
6061
self._permute = [i for i in range(len(self._src.obj.shape))]
@@ -252,8 +253,8 @@ def get_permute(self) -> List[int]:
252253
return self._permute
253254

254255
def _check(self) -> None:
255-
if self._src.stype != SymbolType.Global:
256-
raise InternalError('shr-load: `src` operand is not in global mem.')
256+
#if self._src.stype != SymbolType.Global:
257+
# raise InternalError('shr-load: `src` operand is not in global mem.')
257258

258259
if not isinstance(self._src.obj, Tensor):
259260
raise InternalError(f'shr-load: `src` operand is not a tensor, instead: {self._src.obj}')

tensorforge/backend/instructions/memory/store.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -211,10 +211,7 @@ def __init__(self,
211211
context: Context,
212212
src: Symbol,
213213
dest: Symbol,
214-
alpha: float,
215-
beta: float,
216-
num_compute_threads: int,
217-
num_active_threads: int):
214+
num_threads: int):
218215
super(StoreShrMemToGlb, self).__init__(context)
219216

220217
#if src.stype != SymbolType.SharedMem:
@@ -225,8 +222,6 @@ def __init__(self,
225222

226223
self._dest = dest
227224
self._src = src
228-
self._alpha = alpha
229-
self._beta = beta
230225
self._num_threads = num_active_threads
231226
self._is_ready = True
232227

tensorforge/backend/instructions/ptr_manip.py

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,17 @@ def __init__(self,
1010
src,
1111
dest,
1212
include_extra_offset=True,
13-
batch_offset=0):
13+
batch_offset=0,
14+
update_dest=None,
15+
pipeline = False):
1416
super(GetElementPtr, self).__init__(context)
1517
self._src = src
1618
self._dest = dest
1719
self._include_extra_offset = include_extra_offset
1820
self._is_ready = True
1921
self._batch_offset = batch_offset
22+
self._update_dest = update_dest
23+
self._pipeline = pipeline
2024

2125
def gen_code(self, writer):
2226

@@ -30,21 +34,23 @@ def gen_code(self, writer):
3034

3135
datatype = self._vm._fp_type if self._src.obj.datatype is None else self._src.obj.datatype
3236

37+
const_mod = '' if self._pipeline else 'const'
38+
3339
address = ''
3440
if isinstance(batch_addressing, StridedAddressing):
3541
main_offset = f'{GeneralLexicon.BATCH_ID_NAME}{self._batch_offset} * {batch_addressing.stride}'
3642
sub_offset = f'{batch_obj.get_offset_to_first_element()}'
3743
address = f'{main_offset} + {batch_addressing.offset} + {sub_offset}{extra_offset}'
3844
rhs = f'&{self._src.name}[{address}]'
3945
lhs = 'const ' if self._src.obj.direction == DataFlowDirection.SOURCE else ''
40-
lhs += f'{datatype} * const {self._vm.get_lexic().restrict_kw} {self._dest.name}'
46+
lhs += f'{datatype} *{const_mod} {self._vm.get_lexic().restrict_kw} {self._dest.name}'
4147
if batch_addressing == Addressing.STRIDED:
4248
main_offset = f'{GeneralLexicon.BATCH_ID_NAME}{self._batch_offset} * {batch_obj.get_real_volume()}'
4349
sub_offset = f'{batch_obj.get_offset_to_first_element()}'
4450
address = f'{main_offset} + {sub_offset}{extra_offset}'
4551
rhs = f'&{self._src.name}[{address}]'
4652
lhs = 'const ' if self._src.obj.direction == DataFlowDirection.SOURCE else ''
47-
lhs += f'{datatype} * const {self._vm.get_lexic().restrict_kw} {self._dest.name}'
53+
lhs += f'{datatype} *{const_mod} {self._vm.get_lexic().restrict_kw} {self._dest.name}'
4854
elif batch_addressing == Addressing.PTR_BASED:
4955
main_offset = f'{GeneralLexicon.BATCH_ID_NAME}{self._batch_offset}'
5056
sub_offset = f'{batch_obj.get_offset_to_first_element()}'
@@ -57,19 +63,23 @@ def gen_code(self, writer):
5763
rhs = f'(tensorforge::SpacePtrRestrict<{lhs}, tensorforge::GlobalMemspace>){rhs}'
5864
lhs = f'auto {self._dest.name}'
5965
else:
60-
lhs += f'{datatype} * const {self._vm.get_lexic().restrict_kw} {self._dest.name}'
66+
lhs += f'{datatype} *{const_mod} {self._vm.get_lexic().restrict_kw} {self._dest.name}'
6167
elif batch_addressing == Addressing.NONE:
6268
address = f'{batch_obj.get_offset_to_first_element()}'
6369
rhs = f'&{self._src.name}[{address}]'
6470
lhs = 'const ' if self._src.obj.direction == DataFlowDirection.SOURCE else ''
65-
lhs += f'{datatype} * const {self._vm.get_lexic().restrict_kw} {self._dest.name}'
71+
lhs += f'{datatype} *{const_mod} {self._vm.get_lexic().restrict_kw} {self._dest.name}'
6672
elif batch_addressing == Addressing.SCALAR:
6773
rhs = f'{self._src.name}'
6874
lhs = f'{datatype} {self._dest.name}'
6975
else:
7076
GenerationError(f'unknown addressing of {self._src.name}, given {batch_addressing}')
7177

72-
writer(f'{lhs} = {rhs};')
78+
if self._update_dest:
79+
writer(f'const auto {self._update_dest.name} = {self._dest.name};')
80+
writer(f'{self._dest.name} = {rhs};')
81+
else:
82+
writer(f'{lhs} = {rhs};')
7383

7484
def __str__(self) -> str:
7585
return f'{self._dest.name} = getelementptr_b2g {self._src.name};'

tensorforge/backend/opt/multibuffer.py

Lines changed: 35 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
from .abstract import AbstractTransformer, Context, AbstractInstruction
33
from tensorforge.backend.instructions.compute import ComputeInstruction
44
from tensorforge.backend.instructions.memory import AbstractShrMemWrite, MemoryInstruction
5-
from tensorforge.backend.instructions.memory.load import LoadInstruction, LoadWait, GlbToRegLoader
6-
from tensorforge.backend.instructions.memory.store import StoreRegToReg
5+
from tensorforge.backend.instructions.memory.load import LoadInstruction, LoadWait, GlbToRegLoader, GlbToShrLoader
6+
from tensorforge.backend.instructions.memory.store import StoreRegToReg, StoreShrMemToGlb
77
from tensorforge.backend.instructions.ptr_manip import GetElementPtr
88
from tensorforge.backend.instructions.allocate import RegisterAlloc
99
from tensorforge.backend.symbol import SymbolType, Symbol
@@ -12,18 +12,23 @@
1212
class MultiBuffer(AbstractTransformer):
1313
def __init__(self,
1414
context: Context,
15-
instructions: List[AbstractInstruction]):
15+
instructions: List[AbstractInstruction],
16+
shm, scopes):
1617
super(MultiBuffer, self).__init__(context, instructions)
1718
self._global_instrs = []
19+
self._shm = shm
20+
self._shm_symbol = scopes.get_symbol(self._shm)
1821

1922
def apply(self) -> None:
23+
earlystop = False
24+
2025
globalinstrs = []
2126
newinstrs = []
2227

2328
epmap = {}
2429

2530
for i, instr in enumerate(self._instrs):
26-
if isinstance(instr, LoadInstruction) and not isinstance(instr, LoadWait):
31+
if isinstance(instr, GlbToRegLoader):
2732
newregs = deepcopy(instr._dest.obj)
2833
newregs.name = f'preload_{newregs.name}'
2934
newregsym = Symbol(newregs.name, SymbolType.Register, newregs)
@@ -43,13 +48,35 @@ def apply(self) -> None:
4348
newinstrs += [LoadWait(newload1)]
4449
newinstrs += [StoreRegToReg(self._context, newregsym, instr._dest, instr._num_threads)]
4550
newinstrs += [newload2]
46-
elif isinstance(instr, GetElementPtr) or isinstance(instr, RegisterAlloc):
51+
elif isinstance(instr, GlbToShrLoader):
52+
newshrsym = Symbol(f'preload_{instr._dest.name}', SymbolType.SharedMem, instr._dest.obj)
53+
newshrsym.data_view = instr._dest.data_view
54+
newshrsym.num_threads = instr._dest.num_threads
55+
newshrsym.datatype = instr._dest.datatype
56+
newsym = Symbol(f'next_{instr._src.name}', instr._src.stype, instr._src.obj)
57+
newsym.data_view = instr._src.data_view
58+
newsym.num_threads = instr._src.num_threads
59+
newsym.datatype = instr._src.datatype
60+
newload1 = GlbToShrLoader(context=self._context, src=newsym, dest=newshrsym, shr_mem=self._shm_symbol, num_threads=instr._num_threads, permute=None)
61+
newload2 = GlbToShrLoader(context=self._context, src=newsym, dest=newshrsym, shr_mem=self._shm_symbol, num_threads=instr._num_threads, permute=None)
62+
globalinstrs += [GetElementPtr(self._context, epmap[instr._src.name], newsym, batch_offset=1)]
63+
globalinstrs += [newload1]
64+
newinstrs += [GetElementPtr(self._context, epmap[instr._src.name], newsym, batch_offset=1)]
65+
newinstrs += [LoadWait(newload1)]
66+
newinstrs += [GlbToShrLoader(context=self._context, src=newshrsym, dest=instr._dest, shr_mem=self._shm_symbol, num_threads=instr._num_threads, permute=None, no_memcpy=True)]
67+
newinstrs += [newload2]
68+
elif isinstance(instr, GetElementPtr) or isinstance(instr, RegisterAlloc) or isinstance(instr, LoadWait):
4769
newinstrs += [instr]
4870

4971
# hack
5072
if isinstance(instr, GetElementPtr):
5173
epmap[instr._dest.name] = instr._src
5274
else:
53-
self._global_instrs += globalinstrs
54-
self._instrs = newinstrs + self._instrs[i:]
55-
break
75+
if earlystop:
76+
newinstrs += self._instrs[i:]
77+
break
78+
else:
79+
newinstrs += [instr]
80+
81+
self._instrs = newinstrs
82+
self._global_instrs += globalinstrs

tensorforge/backend/opt/optimizer.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,32 +10,40 @@
1010
from .remove_redundancy import RemoveRedundancyOpt
1111
from .memmove import MoveLoads
1212
from .multibuffer import MultiBuffer
13+
from .ptrpipe import PtrPipe
1314

1415
class OptimizationStage:
1516
def __init__(self,
1617
context: Context,
1718
shr_mem: ShrMemObject,
1819
instructions: List[AbstractInstruction],
19-
num_threads: int):
20+
num_threads: int,
21+
scopes):
2022
self._context = context
2123
self._shr_mem: ShrMemObject = shr_mem
2224
self._instrs: List[AbstractInstruction] = instructions
2325
self._global_instrs: List[AbstractInstruction] = []
2426
self._num_instrs: int = len(instructions)
2527
self._user_options = context.get_user_options()
2628
self._num_threads = num_threads
29+
self._scopes = scopes
2730

2831
def optimize(self):
2932
opt = MoveLoads(self._context, self._instrs)
3033
opt.apply()
3134
self._instrs = opt.get_instructions()
3235

33-
opt = MultiBuffer(self._context, self._instrs)
36+
opt = MultiBuffer(self._context, self._instrs, self._shr_mem, self._scopes)
3437
opt.apply()
3538
self._instrs = opt.get_instructions()
3639
self._global_instrs = opt._global_instrs
3740

38-
opt = LivenessAnalysis(self._context, self._instrs)
41+
opt = PtrPipe(self._context, self._instrs)
42+
opt.apply()
43+
self._instrs = opt.get_instructions()
44+
self._global_instrs += opt._global_instrs
45+
46+
opt = LivenessAnalysis(self._context, self._global_instrs + self._instrs)
3947
opt.apply()
4048
live_map: Dict[int, Set[Symbol]] = opt.get_live_map()
4149

tensorforge/backend/opt/ptrpipe.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
from typing import List
2+
from .abstract import AbstractTransformer, Context, AbstractInstruction
3+
from tensorforge.backend.instructions.compute import ComputeInstruction
4+
from tensorforge.backend.instructions.memory import AbstractShrMemWrite, MemoryInstruction
5+
from tensorforge.backend.instructions.memory.load import LoadInstruction, LoadWait, GlbToRegLoader
6+
from tensorforge.backend.instructions.memory.store import StoreRegToReg
7+
from tensorforge.backend.instructions.ptr_manip import GetElementPtr
8+
from tensorforge.backend.instructions.allocate import RegisterAlloc
9+
from tensorforge.backend.symbol import SymbolType, Symbol
10+
from copy import deepcopy
11+
12+
class PtrPipe(AbstractTransformer):
  """Pointer-pipelining transformer.

  Rewrites every `GetElementPtr` in the instruction stream so that the pointer
  for the NEXT batch element is computed one iteration ahead:

  * into `_global_instrs` (loop prologue) goes a `pipeline=True` GEP that
    initializes a mutable `preload{offset}_...` pointer for the first iteration;
  * in the loop body, the original GEP is replaced by a GEP with
    `update_dest=<original dest>`, which (per `GetElementPtr.gen_code`)
    publishes the previously preloaded pointer under the original name and
    then advances the preload pointer to `batch_offset + 1`.

  NOTE(review): the original GEP instruction itself is dropped from the body —
  assumed intentional, since the `update_dest` GEP takes over producing
  `instr._dest`; confirm against `GetElementPtr.gen_code`.
  """
  def __init__(self,
               context: Context,
               instructions: List[AbstractInstruction]):
    super(PtrPipe, self).__init__(context, instructions)
    # Prologue instructions emitted once before the batch loop.
    self._global_instrs = []

  def apply(self) -> None:
    """Transform `self._instrs` in place; fills `self._global_instrs`."""
    globalinstrs = []
    newinstrs = []

    # Unused loop index removed: only the instruction itself is needed.
    for instr in self._instrs:
      if isinstance(instr, GetElementPtr):
        # Mutable pointer symbol that carries the "next element" address
        # across iterations; aliases the source object of the original GEP.
        newdest = Symbol(f'preload{instr._batch_offset}_{instr._src.name}', instr._src.stype, instr._src.obj)
        # In-loop GEP: publish the preloaded pointer as the original dest,
        # then advance the preload pointer by one batch element.
        newgep = GetElementPtr(self._context, instr._src, newdest, batch_offset=instr._batch_offset + 1, update_dest=instr._dest)
        newinstrs += [newgep]
        # Prologue GEP: declare the preload pointer non-const (`pipeline=True`)
        # so the in-loop GEP may reassign it.
        newgepstart = GetElementPtr(self._context, instr._src, newdest, batch_offset=instr._batch_offset + 1, pipeline=True)
        globalinstrs += [newgepstart]
      else:
        newinstrs += [instr]

    self._instrs = newinstrs
    self._global_instrs = globalinstrs

tensorforge/generators/generator.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,8 @@ def generate(self):
139139
opt = OptimizationStage(context=self._context,
140140
shr_mem=self._section.shr_mem_obj,
141141
instructions=self._section.ir,
142-
num_threads=self._num_threads)
142+
num_threads=self._num_threads,
143+
scopes = self._scopes)
143144
opt.optimize()
144145
self._section.ir = opt.get_instructions()
145146
self._section.global_ir += opt.get_global_instructions()
@@ -191,6 +192,7 @@ def _generate_kernel(self):
191192

192193
writer(f'const auto {GeneralLexicon.BATCH_ID_NAME}_start = {start};')
193194
writer(f'const auto {GeneralLexicon.BATCH_ID_NAME}1 = {GeneralLexicon.BATCH_ID_NAME}_start < {GeneralLexicon.NUM_ELEMENTS}{i} ? {GeneralLexicon.BATCH_ID_NAME}_start : 0;')
195+
writer(f'const auto {GeneralLexicon.BATCH_ID_NAME}2 = {GeneralLexicon.BATCH_ID_NAME}1 + {stride} < {GeneralLexicon.NUM_ELEMENTS}{i} ? {GeneralLexicon.BATCH_ID_NAME}1 + {stride} : {GeneralLexicon.BATCH_ID_NAME}1;')
194196

195197
for instruction in section.global_ir:
196198
if instruction.is_ready():
@@ -214,6 +216,7 @@ def generate_inner():
214216

215217
with writer.For(f'size_t {GeneralLexicon.BATCH_ID_NAME}0 = {start}; {GeneralLexicon.BATCH_ID_NAME}0 < {GeneralLexicon.NUM_ELEMENTS}{i}; {GeneralLexicon.BATCH_ID_NAME}0 += {stride}'):
216218
writer(f'const auto {GeneralLexicon.BATCH_ID_NAME}1 = {GeneralLexicon.BATCH_ID_NAME}0 + {stride} < {GeneralLexicon.NUM_ELEMENTS}{i} ? {GeneralLexicon.BATCH_ID_NAME}0 + {stride} : {GeneralLexicon.BATCH_ID_NAME}0;')
219+
writer(f'const auto {GeneralLexicon.BATCH_ID_NAME}2 = {GeneralLexicon.BATCH_ID_NAME}1 + {stride} < {GeneralLexicon.NUM_ELEMENTS}{i} ? {GeneralLexicon.BATCH_ID_NAME}1 + {stride} : {GeneralLexicon.BATCH_ID_NAME}1;')
217220
generate_inner()
218221
elif self._clusterlaunchcontrol:
219222
writer(f'__shared__ tensorforge::ClusterLaunchCtrl launchctrl;')
@@ -621,7 +624,10 @@ def _get_element_size_guard(self, i):
621624
return f'{GeneralLexicon.BATCH_ID_NAME}0 < {GeneralLexicon.NUM_ELEMENTS}{i}'
622625

623626
def _get_flag_guard(self, writer, i):
624-
writer(f'bool allowed = true;')
625-
with writer.If(f'{GeneralLexicon.FLAGS_NAME}{i} != nullptr'):
626-
writer(f'allowed = static_cast<bool>({GeneralLexicon.FLAGS_NAME}{i}[{GeneralLexicon.BATCH_ID_NAME}0]);')
627-
return 'allowed'
627+
if False:
628+
writer(f'bool allowed = true;')
629+
with writer.If(f'{GeneralLexicon.FLAGS_NAME}{i} != nullptr'):
630+
writer(f'allowed = static_cast<bool>({GeneralLexicon.FLAGS_NAME}{i}[{GeneralLexicon.BATCH_ID_NAME}0]);')
631+
return 'allowed'
632+
else:
633+
return 'true'

0 commit comments

Comments
 (0)