Add ThreadBlock Maps as Preprocessing #2048
Draft: ThrudPrimrose wants to merge 8 commits into main from tblock_map_preprocessing
Commits (8):
fecdb50  If no threadblock map in a GPU device kernel's scoep add it as prepro… (ThrudPrimrose)
35eb604  refactor for CI (ThrudPrimrose)
952e7a0  Improve pass to accept default config schema value (ThrudPrimrose)
6f51797  1 (ThrudPrimrose)
d8b4969  Restore default behaviors as expected (ThrudPrimrose)
aa68b1a  Refactor (ThrudPrimrose)
c97a8fd  Update (ThrudPrimrose)
c8fae8a  Check nodes recursively now (ThrudPrimrose)
New file (+157 lines):

# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved.
"""This module contains a transformation that adds an explicit thread block
(``GPU_ThreadBlock``) map inside GPU device map scopes."""

import warnings

import dace
from dace import dtypes
from dace.properties import make_properties, SymbolicProperty
from dace.sdfg import SDFG, ControlFlowRegion, SDFGState, nodes
from dace.sdfg import utils as sdutil
from dace.transformation import transformation
from dace.transformation.dataflow.tiling import MapTiling


@make_properties
class AddThreadBlockMap(transformation.SingleStateTransformation):
    """
    Adds a thread block schedule to a device map scope.
    """

    map_entry = transformation.PatternNode(nodes.MapEntry)

    # Properties
    thread_block_size_x = SymbolicProperty(dtype=int,
                                           default=None,
                                           allow_none=True,
                                           desc="Number of threads in the thread block X dimension")
    thread_block_size_y = SymbolicProperty(dtype=int,
                                           default=None,
                                           allow_none=True,
                                           desc="Number of threads in the thread block Y dimension")
    thread_block_size_z = SymbolicProperty(dtype=int,
                                           default=None,
                                           allow_none=True,
                                           desc="Number of threads in the thread block Z dimension")
    tiles_evenly = SymbolicProperty(dtype=bool,
                                    default=False,
                                    desc="Whether the thread block tile sizes divide the map range evenly "
                                         "(forwarded to MapTiling as ``divides_evenly``).")

    @classmethod
    def expressions(cls):
        return [sdutil.node_path_graph(cls.map_entry)]

    def preprocess_default_dims(self):
        # Determine the thread block dimensions:
        # 1. If all three sizes were passed explicitly, use them as given.
        # 2. If at least one size is None:
        #    2.1. Use the device map's gpu_block_size if it is set.
        #    2.2. Otherwise fall back to the global default block size from the configuration.
        if self.thread_block_size_x is None or self.thread_block_size_y is None or self.thread_block_size_z is None:
            if self.map_entry.map.gpu_block_size is not None:
                # If gpu_block_size is set, use it
                self.thread_block_size_x = self.map_entry.map.gpu_block_size[0]
                self.thread_block_size_y = self.map_entry.map.gpu_block_size[1]
                self.thread_block_size_z = self.map_entry.map.gpu_block_size[2]
            else:
                x, y, z = dace.config.Config.get('compiler', 'cuda', 'default_block_size').split(',')
                try:
                    self.thread_block_size_x = int(x)
                    self.thread_block_size_y = int(y)
                    self.thread_block_size_z = int(z)
                except ValueError:
                    raise ValueError("Invalid default block size format. Expected 'x,y,z' where x, y, z are integers.")

        num_dims_in_map = len(self.map_entry.map.range)
        # Collapse missing thread block dimensions: into y if the map has 2 dimensions, into x if it has 1
        if num_dims_in_map < 3:
            print_warning = False
            old_block = (self.thread_block_size_x, self.thread_block_size_y, self.thread_block_size_z)
            if num_dims_in_map == 2:
                self.thread_block_size_y *= self.thread_block_size_z
                if self.thread_block_size_z > 1:
                    print_warning = True
                self.thread_block_size_z = 1
            elif num_dims_in_map == 1:
                self.thread_block_size_x *= self.thread_block_size_y * self.thread_block_size_z
                if self.thread_block_size_y > 1 or self.thread_block_size_z > 1:
                    print_warning = True
                self.thread_block_size_y = 1
                self.thread_block_size_z = 1
            new_block = (self.thread_block_size_x, self.thread_block_size_y, self.thread_block_size_z)
            if print_warning:
                warnings.warn(
                    f'Default block size has more dimensions ({old_block}) than kernel dimensions '
                    f'({num_dims_in_map}) in map "{self.map_entry.map.label}". Linearizing block '
                    f'size to {new_block}. Consider setting the ``gpu_block_size`` property.', UserWarning)

    def can_be_applied(self, graph, expr_index, sdfg, permissive=False):
        self.preprocess_default_dims()

        # Reject block sizes above the usual hardware limit of 1024 threads per block
        if self.thread_block_size_x * self.thread_block_size_y * self.thread_block_size_z > 1024:
            return False

        if self.map_entry.map.schedule != dtypes.ScheduleType.GPU_Device:
            return False

        # Walk the device map scope, descending into nested SDFGs and control flow regions;
        # only MapEntry nodes are checked
        kernel_nodes = list(graph.all_nodes_between(self.map_entry, graph.exit_node(self.map_entry)))
        while kernel_nodes:
            node = kernel_nodes.pop(0)
            # If the node is a nested SDFG or a control flow region / state, queue its contents
            # as well; otherwise only the popped node itself is checked
            if isinstance(node, nodes.NestedSDFG):
                kernel_nodes.extend(node.sdfg.nodes())
            elif isinstance(node, (ControlFlowRegion, SDFGState)):
                kernel_nodes.extend(node.nodes())

            if (isinstance(node, nodes.MapEntry)
                    and node.map.schedule in (dtypes.ScheduleType.GPU_ThreadBlock,
                                              dtypes.ScheduleType.GPU_ThreadBlock_Dynamic)):
                # The scope already contains a thread block schedule, do not apply
                return False

        return True

    def update_names(self):
        pass

    def apply(self, state: SDFGState, sdfg: SDFG):
        self.preprocess_default_dims()

        map_entry = self.map_entry

        tx = self.thread_block_size_x
        ty = self.thread_block_size_y
        tz = self.thread_block_size_z
        block_dims = [tz, ty, tx]

        # The thread block sizes depend on the number of map dimensions: GPU code generation maps
        # the parameters i0, i1, i2 to blockDim.z, blockDim.y, blockDim.x, respectively.
        # If more tile sizes are given than there are map parameters, the extra sizes are dropped.
        tile_sizes = [1] * len(map_entry.map.params)
        used_dimensions = min(3, len(map_entry.map.params))
        tile_sizes[-used_dimensions:] = block_dims[-used_dimensions:]
        applied_gpu_block_dims = [1, 1, 1]
        applied_gpu_block_dims[-used_dimensions:] = block_dims[-used_dimensions:]  # currently unused

        # tile_trivial=True simplifies some checks for the BlockCoarsening and ThreadCoarsening transformations
        MapTiling.apply_to(
            sdfg=sdfg,
            options=dict(
                prefix="b",
                tile_sizes=tile_sizes,
                divides_evenly=self.tiles_evenly,  # TODO: improve this
                tile_trivial=True,
                skew=True),
            map_entry=map_entry)

        # The old device map entry becomes the new thread block map entry
        map_entry.map.schedule = dtypes.ScheduleType.GPU_ThreadBlock

    @staticmethod
    def annotates_memlets():
        return False
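For context, here is a minimal usage sketch of the new transformation. It is an assumption-laden illustration, not part of the PR: the import path for `AddThreadBlockMap` is a placeholder (the diff view does not show the file name), and the `saxpy` program is invented for demonstration. Leaving the block size options unset (`None`) would fall back to the map's `gpu_block_size` or the `compiler.cuda.default_block_size` configuration entry, as implemented in `preprocess_default_dims`.

```python
# Minimal usage sketch (assumptions: placeholder import path, CUDA-capable DaCe install).
import dace

# Hypothetical import location for the class added in this PR; adjust to the actual module path.
from dace.transformation.dataflow import AddThreadBlockMap

N = dace.symbol('N')


@dace.program
def saxpy(a: dace.float32, x: dace.float32[N], y: dace.float32[N]):
    for i in dace.map[0:N]:
        y[i] = a * x[i] + y[i]


sdfg = saxpy.to_sdfg()
sdfg.apply_gpu_transformations()  # turns the outer map into a GPU_Device map

# Request an explicit 128x1x1 thread block; the pass tiles the device map accordingly.
applied = sdfg.apply_transformations(
    AddThreadBlockMap,
    options=dict(thread_block_size_x=128, thread_block_size_y=1, thread_block_size_z=1))
print(f'AddThreadBlockMap applied {applied} time(s)')
```

Whether the pattern matches is governed by `can_be_applied`: the map must carry the `GPU_Device` schedule, the requested block may not exceed 1024 threads, and the scope must not already contain a thread block map.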
Review comment: I am thinking that it would be simpler to write something similar to:
It may not be immediately as readable as the current code, though, so this is just a suggestion.
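Purely as an illustration of one possible simplification, and not necessarily the formulation this comment has in mind, the explicit worklist in `can_be_applied` could be condensed into a small recursive generator plus a single `any(...)` check. The helper names below are invented for this sketch.

```python
# Illustrative sketch only; not the snippet referenced in the review comment above.
import dace
from dace.sdfg import ControlFlowRegion, SDFGState, nodes


def _iter_scope_nodes(node_iterable):
    """Yield every node, descending into nested SDFGs and control flow regions/states."""
    for node in node_iterable:
        yield node
        if isinstance(node, nodes.NestedSDFG):
            yield from _iter_scope_nodes(node.sdfg.nodes())
        elif isinstance(node, (ControlFlowRegion, SDFGState)):
            yield from _iter_scope_nodes(node.nodes())


def _has_threadblock_map(graph, map_entry) -> bool:
    """True if any map in the device map scope already carries a thread block schedule."""
    scope_nodes = graph.all_nodes_between(map_entry, graph.exit_node(map_entry))
    return any(
        isinstance(n, nodes.MapEntry) and n.map.schedule in (
            dace.dtypes.ScheduleType.GPU_ThreadBlock,
            dace.dtypes.ScheduleType.GPU_ThreadBlock_Dynamic)
        for n in _iter_scope_nodes(scope_nodes))
```

With such helpers, the loop in `can_be_applied` reduces to `if _has_threadblock_map(graph, self.map_entry): return False`; whether that is clearer than the explicit worklist is a matter of taste, as the comment itself notes.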