5 changes: 5 additions & 0 deletions dace/codegen/targets/cuda.py
@@ -29,6 +29,9 @@
from dace.sdfg.graph import MultiConnectorEdge
from dace.sdfg.state import ControlFlowRegion, StateSubgraphView
from dace.transformation import helpers as xfh
import dace.transformation
import dace.transformation.dataflow
import dace.transformation.dataflow.add_threadblock_map
from dace.transformation.passes import analysis as ap

if TYPE_CHECKING:
@@ -153,6 +156,8 @@ def preprocess(self, sdfg: SDFG) -> None:
'CUDA',
target_type=target_type)

        sdfg.apply_transformations_once_everywhere(
            dace.transformation.dataflow.add_threadblock_map.AddThreadBlockMap)

# Find GPU<->GPU strided copies that cannot be represented by a single copy command
from dace.transformation.dataflow import CopyToMap
for e, state in list(sdfg.all_edges_recursive()):
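The hook above makes the CUDA backend add a thread-block map to every device map during preprocessing. A minimal sketch of the equivalent manual invocation (illustrative only; `gpu_sdfg` is a hypothetical SDFG containing GPU_Device maps):

import dace
from dace.transformation.dataflow.add_threadblock_map import AddThreadBlockMap

# 'gpu_sdfg' is a placeholder for any SDFG with GPU_Device maps
gpu_sdfg.apply_transformations_once_everywhere(AddThreadBlockMap)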
157 changes: 157 additions & 0 deletions dace/transformation/dataflow/add_threadblock_map.py
@@ -0,0 +1,157 @@
# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved.
""" This module contains the AddThreadBlockMap transformation, which adds an explicit
thread-block schedule to a GPU device map. """

import dace
from dace.sdfg import SDFG, ControlFlowRegion, SDFGState
from dace.properties import make_properties, SymbolicProperty
from dace.sdfg import nodes
from dace.sdfg import utils as sdutil
from dace.transformation import transformation
from dace.transformation.dataflow.tiling import MapTiling
from dace import dtypes
import warnings


@make_properties
class AddThreadBlockMap(transformation.SingleStateTransformation):
    """
    Adds a thread-block schedule to a GPU device map scope.
    """

    map_entry = transformation.PatternNode(nodes.MapEntry)

    # Properties
    thread_block_size_x = SymbolicProperty(dtype=int,
                                           default=None,
                                           allow_none=True,
                                           desc="Number of threads in the thread-block X dimension")
    thread_block_size_y = SymbolicProperty(dtype=int,
                                           default=None,
                                           allow_none=True,
                                           desc="Number of threads in the thread-block Y dimension")
    thread_block_size_z = SymbolicProperty(dtype=int,
                                           default=None,
                                           allow_none=True,
                                           desc="Number of threads in the thread-block Z dimension")
    tiles_evenly = SymbolicProperty(dtype=bool,
                                    default=False,
                                    desc="Whether the map range divides evenly by the block size "
                                         "(forwarded to MapTiling as ``divides_evenly``).")

    @classmethod
    def expressions(cls):
        return [sdutil.node_path_graph(cls.map_entry)]

    def preprocess_default_dims(self):
        # Determine the thread-block dimensions to use:
        # 1. If all three sizes were passed explicitly, use them as-is.
        # 2. If at least one size is None:
        #    2.1. First check whether the device map has ``gpu_block_size`` set.
        #    2.2. Otherwise, fall back to the global default block size from the config.
        if self.thread_block_size_x is None or self.thread_block_size_y is None or self.thread_block_size_z is None:
            if self.map_entry.gpu_block_size is not None:
                # If gpu_block_size is set on the map, use it
                self.thread_block_size_x = self.map_entry.gpu_block_size[0]
                self.thread_block_size_y = self.map_entry.gpu_block_size[1]
                self.thread_block_size_z = self.map_entry.gpu_block_size[2]
            else:
                try:
                    x, y, z = dace.config.Config.get('compiler', 'cuda', 'default_block_size').split(',')
                    self.thread_block_size_x = int(x)
                    self.thread_block_size_y = int(y)
                    self.thread_block_size_z = int(z)
                except ValueError:
                    raise ValueError("Invalid default block size format. Expected 'x,y,z' where x, y, z are integers.")

        num_dims_in_map = len(self.map_entry.map.range)
        # Collapse excess thread-block dimensions: fold z into y for 2-D maps,
        # and both y and z into x for 1-D maps
        if num_dims_in_map < 3:
            print_warning = False
            old_block = (self.thread_block_size_x, self.thread_block_size_y, self.thread_block_size_z)
            if num_dims_in_map == 2:
                self.thread_block_size_y *= self.thread_block_size_z
                if self.thread_block_size_z > 1:
                    print_warning = True
                self.thread_block_size_z = 1
            elif num_dims_in_map == 1:
                self.thread_block_size_x *= self.thread_block_size_y * self.thread_block_size_z
                if self.thread_block_size_y > 1 or self.thread_block_size_z > 1:
                    print_warning = True
                self.thread_block_size_y = 1
                self.thread_block_size_z = 1
            new_block = (self.thread_block_size_x, self.thread_block_size_y, self.thread_block_size_z)
Contributor:
I am thinking that it would be simpler to write something similar to:

old_block = (self.thread_block_size_x, self.thread_block_size_y, self.thread_block_size_z)
new_block = list(old_block)
for d in range(2, num_dims_in_map - 1, -1):
    new_block[d - 1] *= new_block[d]
    new_block[d] = 1
new_block = tuple(new_block)
if new_block != old_block:
    warnings.warn ...

It may not be immediately as readable as the current code though, so this is just a suggestion.

            if print_warning:
                warnings.warn(
                    f'Default block size has more dimensions ({old_block}) than kernel dimensions '
                    f'({num_dims_in_map}) in map "{self.map_entry.map.label}". Linearizing block '
                    f'size to {new_block}. Consider setting the ``gpu_block_size`` property.', UserWarning)
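For reference, a minimal sketch of the linearization rule implemented above (illustrative only, not part of this diff): with the default block size "32,8,4", a 2-D kernel folds z into y, and a 1-D kernel folds everything into x.

def linearize_block(block, num_dims_in_map):
    # Fold unused trailing block dimensions into the last
    # dimension the kernel actually has
    bx, by, bz = block
    if num_dims_in_map == 2:
        by, bz = by * bz, 1
    elif num_dims_in_map == 1:
        bx, by, bz = bx * by * bz, 1, 1
    return (bx, by, bz)

assert linearize_block((32, 8, 4), 2) == (32, 32, 1)
assert linearize_block((32, 8, 4), 1) == (1024, 1, 1)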

    def can_be_applied(self, graph, expr_index, sdfg, permissive=False):
        self.preprocess_default_dims()

        # CUDA limits the number of threads per block to 1024
        if self.thread_block_size_x * self.thread_block_size_y * self.thread_block_size_z > 1024:
            return False

        if self.map_entry.map.schedule != dtypes.ScheduleType.GPU_Device:
            return False

        # Recursively enter NestedSDFGs, states and ControlFlowRegions; only MapEntry nodes are checked
        kernel_nodes = set(graph.all_nodes_between(self.map_entry, graph.exit_node(self.map_entry)))
        while kernel_nodes:
            node = kernel_nodes.pop()
            # NestedSDFGs, states and ControlFlowRegions contribute their contents;
            # every popped node is checked below
            if isinstance(node, dace.nodes.NestedSDFG):
                kernel_nodes.update(node.sdfg.nodes())
            elif isinstance(node, SDFGState):
                kernel_nodes.update(node.nodes())
            elif isinstance(node, ControlFlowRegion):
                kernel_nodes.update(node.nodes())

            if (isinstance(node, nodes.MapEntry)
                    and node.map.schedule in (dace.dtypes.ScheduleType.GPU_ThreadBlock,
                                              dace.dtypes.ScheduleType.GPU_ThreadBlock_Dynamic)):
                # A thread-block schedule already exists within the kernel, do not apply
                return False

        return True

    def update_names(self):
        pass

    def apply(self, state: SDFGState, sdfg: SDFG):
        self.preprocess_default_dims()

        map_entry = self.map_entry

        tx = self.thread_block_size_x
        ty = self.thread_block_size_y
        tz = self.thread_block_size_z
        block_dims = [tz, ty, tx]

        # The applied tile sizes depend on the number of map dimensions:
        # GPU code generation maps the parameters i0, i1, i2 to blockDim.z,
        # blockDim.y and blockDim.x, respectively. If more tile sizes are given
        # than there are parameters, the excess sizes are culled and ignored.
        tile_sizes = [1] * len(map_entry.map.params)
        used_dimensions = min(3, len(map_entry.map.params))
        tile_sizes[-used_dimensions:] = block_dims[-used_dimensions:]
        applied_gpu_block_dims = [1, 1, 1]
        applied_gpu_block_dims[-used_dimensions:] = block_dims[-used_dimensions:]

        # Tiling trivial dimensions simplifies some checks for the
        # BlockCoarsening and ThreadCoarsening transformations
        MapTiling.apply_to(
            sdfg=sdfg,
            options=dict(
                prefix="b",
                tile_sizes=tile_sizes,
                divides_evenly=self.tiles_evenly,  # TODO: improve this
                tile_trivial=True,
                skew=True),
            map_entry=map_entry)

        # After tiling, the old device map entry is the new thread-block map entry
        map_entry.map.schedule = dtypes.ScheduleType.GPU_ThreadBlock

    @staticmethod
    def annotates_memlets():
        return False
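A usage sketch for applying the transformation to one specific device map (not part of this diff; `my_sdfg` and `dev_map_entry` are hypothetical), mirroring the MapTiling.apply_to call above:

from dace.transformation.dataflow.add_threadblock_map import AddThreadBlockMap

# 'my_sdfg' / 'dev_map_entry' are placeholders for an SDFG and the
# MapEntry of a GPU_Device map within it
AddThreadBlockMap.apply_to(my_sdfg,
                           options=dict(thread_block_size_x=64,
                                        thread_block_size_y=4,
                                        thread_block_size_z=1),
                           map_entry=dev_map_entry)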