Draft
Changes from 7 commits
5 changes: 5 additions & 0 deletions dace/codegen/targets/cuda.py
@@ -29,6 +29,9 @@
from dace.sdfg.graph import MultiConnectorEdge
from dace.sdfg.state import ControlFlowRegion, StateSubgraphView
from dace.transformation import helpers as xfh
import dace.transformation
import dace.transformation.dataflow
import dace.transformation.dataflow.add_threadblock_map
from dace.transformation.passes import analysis as ap

if TYPE_CHECKING:
@@ -153,6 +156,8 @@ def preprocess(self, sdfg: SDFG) -> None:
'CUDA',
target_type=target_type)

        sdfg.apply_transformations_once_everywhere(dace.transformation.dataflow.add_threadblock_map.AddThreadBlockMap)

# Find GPU<->GPU strided copies that cannot be represented by a single copy command
from dace.transformation.dataflow import CopyToMap
for e, state in list(sdfg.all_edges_recursive()):
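Note: `apply_transformations_once_everywhere` applies the given pattern transformation once to every match in the SDFG, so this hook inserts a thread-block map under every GPU device map before code generation. A minimal sketch of the same call on a standalone SDFG (the `vecadd` program and the symbol `N` are illustrative assumptions, not part of this PR):

import dace
from dace.transformation.dataflow.add_threadblock_map import AddThreadBlockMap

N = dace.symbol('N')

@dace.program
def vecadd(A: dace.float64[N], B: dace.float64[N]):
    for i in dace.map[0:N]:
        B[i] = A[i] + 1.0

sdfg = vecadd.to_sdfg()
# Turn the top-level map into a GPU_Device map so the pattern matches
sdfg.apply_gpu_transformations()
# Same call as the one added to CUDACodeGen.preprocess above
sdfg.apply_transformations_once_everywhere(AddThreadBlockMap)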
149 changes: 149 additions & 0 deletions dace/transformation/dataflow/add_threadblock_map.py
@@ -0,0 +1,149 @@
# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved.
""" This module contains classes and functions that implement the grid-strided map tiling
transformation."""

import dace
from dace.sdfg import SDFG, SDFGState
from dace.properties import make_properties, SymbolicProperty
from dace.sdfg import nodes
from dace.sdfg import utils as sdutil
from dace.transformation import transformation
from dace.transformation.dataflow.tiling import MapTiling
from dace import dtypes
import warnings


@make_properties
class AddThreadBlockMap(transformation.SingleStateTransformation):
    """
    Adds a thread-block schedule to a device map scope.
    """

    map_entry = transformation.PatternNode(nodes.MapEntry)

    # Properties
    thread_block_size_x = SymbolicProperty(dtype=int,
                                           default=None,
                                           allow_none=True,
                                           desc="Number of threads in the thread block X dimension")
    thread_block_size_y = SymbolicProperty(dtype=int,
                                           default=None,
                                           allow_none=True,
                                           desc="Number of threads in the thread block Y dimension")
    thread_block_size_z = SymbolicProperty(dtype=int,
                                           default=None,
                                           allow_none=True,
                                           desc="Number of threads in the thread block Z dimension")
    tiles_evenly = SymbolicProperty(dtype=bool,
                                    default=False,
                                    desc="Whether the block tile sizes divide the map range evenly; "
                                         "forwarded to MapTiling as divides_evenly")

    @classmethod
    def expressions(cls):
        return [sdutil.node_path_graph(cls.map_entry)]

    def preprocess_default_dims(self):
        # Resolve the default block dimensions when any of them is None:
        # 1. If all three block sizes were passed explicitly, use them as-is.
        # 2. Otherwise, prefer the gpu_block_size property of the device map, if set.
        # 3. Otherwise, fall back to the global default
        #    (compiler.cuda.default_block_size in the DaCe configuration).
        if self.thread_block_size_x is None or self.thread_block_size_y is None or self.thread_block_size_z is None:
            if self.map_entry.map.gpu_block_size is not None:
                # If gpu_block_size is set on the map, use it
                self.thread_block_size_x = self.map_entry.map.gpu_block_size[0]
                self.thread_block_size_y = self.map_entry.map.gpu_block_size[1]
                self.thread_block_size_z = self.map_entry.map.gpu_block_size[2]
            else:
                x, y, z = dace.config.Config.get('compiler', 'cuda', 'default_block_size').split(',')
                try:
                    self.thread_block_size_x = int(x)
                    self.thread_block_size_y = int(y)
                    self.thread_block_size_z = int(z)
                except ValueError:
                    raise ValueError("Invalid default block size format. Expected 'x,y,z' where x, y, z are integers.")

        num_dims_in_map = len(self.map_entry.map.range)
        # Collapse unused thread-block dimensions: into y if the map has 2 dimensions,
        # into x if it has only 1
        if num_dims_in_map < 3:
            print_warning = False
            old_block = (self.thread_block_size_x, self.thread_block_size_y, self.thread_block_size_z)
            if num_dims_in_map == 2:
                self.thread_block_size_y *= self.thread_block_size_z
                if self.thread_block_size_z > 1:
                    print_warning = True
                self.thread_block_size_z = 1
            elif num_dims_in_map == 1:
                self.thread_block_size_x *= self.thread_block_size_y * self.thread_block_size_z
                if self.thread_block_size_y > 1 or self.thread_block_size_z > 1:
                    print_warning = True
                self.thread_block_size_y = 1
                self.thread_block_size_z = 1
            new_block = (self.thread_block_size_x, self.thread_block_size_y, self.thread_block_size_z)
Contributor:
I am thinking that it would be simpler to write something similar to:

old_block = (self.thread_block_size_x, self.thread_block_size_y, self.thread_block_size_z)
new_block = list(old_block)
for d in range(2, num_dims_in_map - 1, -1):
    new_block[d - 1] *= new_block[d]
    new_block[d] = 1
new_block = tuple(new_block)
if new_block != old_block:
    warnings.warn ...

It may not be immediately as readable as the current code though, so this is just a suggestion.
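For reference, a quick standalone check of the suggested loop (the `collapse` wrapper and the example block sizes are illustrative assumptions):

def collapse(block, num_dims_in_map):
    new_block = list(block)
    for d in range(2, num_dims_in_map - 1, -1):
        new_block[d - 1] *= new_block[d]
        new_block[d] = 1
    return tuple(new_block)

assert collapse((32, 4, 2), 3) == (32, 4, 2)   # 3-D map: block unchanged
assert collapse((32, 4, 2), 2) == (32, 8, 1)   # z collapsed into y
assert collapse((32, 4, 2), 1) == (256, 1, 1)  # y and z collapsed into x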

        if print_warning:
            warnings.warn(
                f'Default block size has more dimensions ({old_block}) than kernel dimensions '
                f'({num_dims_in_map}) in map "{self.map_entry.map.label}". Linearizing block '
                f'size to {new_block}. Consider setting the ``gpu_block_size`` property.', UserWarning)

    def can_be_applied(self, graph, expr_index, sdfg, permissive=False):
        self.preprocess_default_dims()

        # CUDA limits a thread block to at most 1024 threads in total
        if self.thread_block_size_x * self.thread_block_size_y * self.thread_block_size_z > 1024:
            return False

        # Only applicable to GPU device maps
        if self.map_entry.map.schedule != dtypes.ScheduleType.GPU_Device:
            return False

        kernel_nodes = graph.all_nodes_between(self.map_entry, graph.exit_node(self.map_entry))
        for node in kernel_nodes:
            if (isinstance(node, nodes.MapEntry)
                    and (node.map.schedule == dace.dtypes.ScheduleType.GPU_ThreadBlock
                         or node.map.schedule == dace.dtypes.ScheduleType.GPU_ThreadBlockDynamic)):
                # If the scope already contains a thread-block-scheduled map, do not apply
                return False

        return True

    def update_names(self):
        pass

    def apply(self, state: SDFGState, sdfg: SDFG):
        self.preprocess_default_dims()

        map_entry = self.map_entry

        tx = self.thread_block_size_x
        ty = self.thread_block_size_y
        tz = self.thread_block_size_z
        block_dims = [tz, ty, tx]

        # The applied tile sizes depend on the number of map dimensions: GPU code
        # generation maps the parameters i0, i1, i2 to blockDim.z, .y and .x,
        # respectively. If more tile sizes are given than there are map parameters,
        # cull the list and ignore the additional sizes.
        tile_sizes = [1] * len(map_entry.map.params)
        used_dimensions = min(3, len(map_entry.map.params))
        tile_sizes[-used_dimensions:] = block_dims[-used_dimensions:]
        applied_gpu_block_dims = [1, 1, 1]
        applied_gpu_block_dims[-used_dimensions:] = block_dims[-used_dimensions:]

        # Tiling trivial dimensions simplifies some checks for the BlockCoarsening
        # and ThreadCoarsening transformations
        MapTiling.apply_to(
            sdfg=sdfg,
            options=dict(
                prefix="b",
                tile_sizes=tile_sizes,
                divides_evenly=self.tiles_evenly,  # TODO: improve this
                tile_trivial=True,
                skew=True),
            map_entry=map_entry)

        # After tiling, the original device map entry becomes the new thread-block map entry
        map_entry.map.schedule = dtypes.ScheduleType.GPU_ThreadBlock

    @staticmethod
    def annotates_memlets():
        return False
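Beyond the automatic call in `preprocess`, the transformation can also be targeted at a single map via `apply_to`, the same mechanism `apply` uses for `MapTiling` above. A minimal sketch, assuming an existing `sdfg` and a `dev_map_entry` whose schedule is `GPU_Device` (the block sizes are illustrative):

from dace.transformation.dataflow.add_threadblock_map import AddThreadBlockMap

# dev_map_entry: a nodes.MapEntry scheduled as GPU_Device (assumed to exist)
AddThreadBlockMap.apply_to(sdfg,
                           options=dict(thread_block_size_x=64,
                                        thread_block_size_y=8,
                                        thread_block_size_z=1),
                           map_entry=dev_map_entry)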