Skip to content
Draft
Show file tree
Hide file tree
Changes from 22 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
5e2d0dc
Add minimal bug example.
andreaskuster Jan 8, 2021
f4763fc
Further reduce minimal example.
andreaskuster Jan 8, 2021
89f87c3
Account for offset to center.
andreaskuster Jan 8, 2021
e1caeb1
Add fpga0 sdk env vars script
andreaskuster Jan 9, 2021
34389ae
Add larger jacobi3d example
andreaskuster Jan 9, 2021
b1bac07
Add temporary fix.
andreaskuster Jan 9, 2021
d300e59
Merge remote-tracking branch 'origin/fix_cross_dependent_path' into f…
andreaskuster Jan 9, 2021
e83e4e3
Increase problem size.
andreaskuster Jan 9, 2021
9da97a3
Add more complex example.
andreaskuster Jan 10, 2021
74f8d21
Merge remote-tracking branch 'origin/fix_cross_dependent_path' into f…
andreaskuster Jan 10, 2021
4000eca
Add minimal bug example.
andreaskuster Jan 8, 2021
f0e2e3b
Further reduce minimal example.
andreaskuster Jan 8, 2021
8023480
Account for offset to center.
andreaskuster Jan 8, 2021
0afabe0
Add temporary fix.
andreaskuster Jan 9, 2021
c4b83c5
Add fpga0 sdk env vars script
andreaskuster Jan 9, 2021
ab8c555
Add larger jacobi3d example
andreaskuster Jan 9, 2021
47cc666
Add more complex example.
andreaskuster Jan 10, 2021
ed1dcb8
Increase problem size.
andreaskuster Jan 9, 2021
97fe0fd
Merge remote-tracking branch 'origin/fix_cross_dependent_path' into f…
andreaskuster Sep 11, 2021
ff683e1
Make example more distinct to test function correctness.
andreaskuster Sep 11, 2021
0e35d03
Add path inclusion for direct file execution.
andreaskuster Sep 11, 2021
fb9966c
Several readme extension
andreaskuster Sep 11, 2021
67a6b93
Remove horidiff hotfix
andreaskuster Sep 11, 2021
47dfc58
Update README.md
andreaskuster Sep 13, 2021
6b737df
Update README.md
andreaskuster Sep 13, 2021
6a27a5f
Move test config to default location
andreaskuster Sep 13, 2021
b9ae43b
Remove local env setup
andreaskuster Sep 13, 2021
2a73675
Merge remote-tracking branch 'origin/fix_cross_dependent_path' into f…
andreaskuster Sep 13, 2021
86ab704
Add extended horidiff example.
andreaskuster Sep 14, 2021
d442f68
Reduce min channel depth to 1024
andreaskuster Sep 15, 2021
6a0cf1b
Update dace version
andreaskuster Sep 15, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ To run the code, the following software must be available:
- Python 3.6.x or newer.
- The `virtualenv` module (installed with `pip install virtualenv`).
- A C++17-capable compiler (e.g., GCC 7.x or Clang 6.x).
- Graphviz (for graph plotting support).
- One or both FPGA compilers:
- Intel FPGA OpenCL SDK (tested with 18.1.1 and 19.1)
- Xilinx Vitis (tested with 2020.2)
Expand Down Expand Up @@ -47,6 +48,13 @@ kernel source files themselves in:
.dacecache/<kernel name>/src/intel_fpga/device
```

To run a low-level analysis of the buffer sizes and to visualize the stencil program, invoke the executable `stencilflow/kernel_chain_graph.py` directly.
Example usage:

```bash
stencilflow/kernel_chain_graph.py -stencil_file test/stencils/jacobi3d_32x32x32_8itr_8vec.json -plot -simulate -report -opt
```

Verification
------------

Expand Down Expand Up @@ -81,3 +89,16 @@ It is a known issue that launching multiple Intel FPGA kernels in quick
succession (such as is done in the tests) can sometimes fail sporadically,
seemingly due to file I/O issues. Running individual programs should never fail.

Publication
-----------

If you use StencilFlow, cite us:
```bibtex
@inproceedings{dace,
author = {Johannes de Fine Licht and Andreas Kuster and Tiziano De Matteis and Tal Ben-Nun and Dominic Hofer and Torsten Hoefler},
title = {StencilFlow: Mapping Large Stencil Programs to Distributed Spatial Computing Systems},
year = {2021},
booktitle = {Proceedings of the IEEE/ACM International Symposium on Code Generation and Optimization (CGO)},
series = {CGO '21}
}
```
84 changes: 84 additions & 0 deletions bug_min.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
{
"inputs": {
"inA": {
"data": "inA_float32.dat",
"data_type": "float32",
"input_dims": [
"i"
]
}
},
"outputs": [
"out"
],
"dimensions": [
10,
10,
10
],
"vectorization": 1,
"program": {
"k0": {
"data_type": "float32",
"computation_string": "k0 = inA[i]",
"boundary_conditions": {
"inA": {
"type": "constant",
"value": 0.0
}
}
},
"k1": {
"data_type": "float32",
"computation_string": "k1 = inA[i]",
"boundary_conditions": {
"inA": {
"type": "constant",
"value": 0.0
}
}
},
"k2": {
"data_type": "float32",
"computation_string": "k2 = k1[i, j, k] + k0[i+1, j, k] + k0[i, j, k]",
"boundary_conditions": {
"k1": {
"type": "constant",
"value": 0.0
},
"k0": {
"type": "constant",
"value": 0.0
}
}
},
"k3": {
"data_type": "float32",
"computation_string": "k3 = k0[i, j, k] + k1[i+1, j+1, k+1] + k1[i, j, k]",
"boundary_conditions": {
"k0": {
"type": "constant",
"value": 0.0
},
"k1": {
"type": "constant",
"value": 0.0
}
}
},
"out": {
"data_type": "float32",
"computation_string": "out = k2[i, j, k] + k3[i, j, k]",
"boundary_conditions": {
"k2":{
"type": "constant",
"value": 0.0
},
"k3": {
"type": "constant",
"value": 0.0
}
}
}
}
}
94 changes: 94 additions & 0 deletions bug_min_ext.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
{
"inputs": {
"inA": {
"data": "inA_float32.dat",
"data_type": "float32",
"input_dims": [
"i"
]
}
},
"outputs": [
"out"
],
"dimensions": [
8,
8,
8
],
"vectorization": 1,
"program": {
"k0": {
"data_type": "float32",
"computation_string": "k0 = inA[i]",
"boundary_conditions": {
"inA": {
"type": "constant",
"value": 0.0
}
}
},
"k1": {
"data_type": "float32",
"computation_string": "k1 = inA[i]",
"boundary_conditions": {
"inA": {
"type": "constant",
"value": 0.0
}
}
},
"k2": {
"data_type": "float32",
"computation_string": "k2 = k1[i, j, k] + k0[i + 1, j, k] + k0[i, j, k]",
"boundary_conditions": {
"k1": {
"type": "constant",
"value": 0.0
},
"k0": {
"type": "constant",
"value": 0.0
}
}
},
"k3": {
"data_type": "float32",
"computation_string": "k3 = k0[i, j, k] + k4[i + 1, j, k] + k4[i, j, k]",
"boundary_conditions": {
"k0": {
"type": "constant",
"value": 0.0
},
"k4": {
"type": "constant",
"value": 0.0
}
}
},
"k4": {
"data_type": "float32",
"computation_string": "k4 = k1[i, j, k] + k1[i+1, j, k]",
"boundary_conditions": {
"k1": {
"type": "constant",
"value": 0.0
}
}
},
"out": {
"data_type": "float32",
"computation_string": "out = k2[i,j,k] + k3[i,j,k]",
"boundary_conditions": {
"k2":{
"type": "constant",
"value": 0.0
},
"k3": {
"type": "constant",
"value": 0.0
}
}
}
}
}
49 changes: 49 additions & 0 deletions stencilflow/kernel_chain_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
import operator
import re
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(__file__)))

from typing import Any, List, Dict, Tuple

Expand Down Expand Up @@ -85,6 +87,29 @@ def __init__(self,
if self.log_level >= LogLevel.MODERATE:
print("Compute delay buffer sizes.")
self.compute_delay_buffer() # compute the delay buffer sizes

for node in self.graph.nodes():
if node.name == "__tmp_T" or node.name == "__tmp_T_sqr_s_1351":
name = "u_tmp"
max_size = self.dimensions[0]*self.dimensions[1]
node.delay_buffer[name] = BoundedQueue(name=name, maxsize=max_size)
node.delay_buffer[name].import_data([None] * node.delay_buffer[name].maxsize)
if node.name == "__tmp_S" or node.name == "__tmp_S_sqr_uv_1352":
name = "v_tmp"
max_size = self.dimensions[0] * self.dimensions[1]
node.delay_buffer[name] = BoundedQueue(name=name, maxsize=max_size)
node.delay_buffer[name].import_data([None] * node.delay_buffer[name].maxsize)
if node.name == "__tmp_T_sqr_s_1351":
name = "ms_sdfg_1330___local_frac_1_dx_1660"
max_size = self.dimensions[0]*self.dimensions[1]
node.delay_buffer[name] = BoundedQueue(name=name, maxsize=max_size)
node.delay_buffer[name].import_data([None] * node.delay_buffer[name].maxsize)
if node.name == "__tmp_S_sqr_uv_1352":
name = "ms_sdfg_1330___local_frac_1_dx_1660"
max_size = self.dimensions[0] * self.dimensions[1]
node.delay_buffer[name] = BoundedQueue(name=name, maxsize=max_size)
node.delay_buffer[name].import_data([None] * node.delay_buffer[name].maxsize)

if self.log_level >= LogLevel.MODERATE:
print("Add channels to the graph edges.")
# plot kernel graphs if flag set to true
Expand Down Expand Up @@ -507,6 +532,11 @@ def compute_delay_buffer(self) -> None:
dimensions=self.dimensions,
index=stencilflow.list_subtract_cwise(
max_delay[:-1], entry[:-1]))

if not isinstance(node, Output):
max_offset = node.dist_to_center[max(node.dist_to_center, key=lambda x: node.dist_to_center[x])]
max_size = max_offset - node.dist_to_center[entry[-1]]

node.delay_buffer[name] = BoundedQueue(name=name,
maxsize=max_size)
node.delay_buffer[name].import_data(
Expand Down Expand Up @@ -789,6 +819,14 @@ def runtime_lower_bound(self):
type=int)
parser.add_argument("-report", action="store_true")
parser.add_argument("-simulate", action="store_true")
parser.add_argument("-opt", action="store_true")
parser.add_argument("-opt_goal", default=["min_fast_mem", 12000], nargs="+")
"""
choices:
- min_com_vol, FAST_MEM_BOUND, SLOW_MEM_BOUND
- min_fast_mem, COM_VOL_BOUND
- opt_ratio, RATIO
"""
args = parser.parse_args()
args.log_level = stencilflow.log_level.LogLevel(args.log_level)
program_description = stencilflow.parse_json(args.stencil_file)
Expand All @@ -809,6 +847,17 @@ def runtime_lower_bound(self):
log_level=LogLevel(args.log_level))
sim.simulate()

# choose optimization goal
if args.opt:
from stencilflow import Optimizer
opt = Optimizer(self.kernel_nodes, self.dimensions)
if args.opt_goal[0] == "min_com_vol":
opt.minimize_comm_vol(fast_memory_bound=args.opt_goal[1], slow_memory_bound=args.opt_goal[2])
if args.opt_goal[0] == "min_fast_mem":
opt.minimize_fast_mem(communication_volume_bound=args.opt_goal[1])
if args.opt_goal[0] == "opt_ratio":
opt.optimize_to_ratio(ratio=args.opt_goal[1])

# output a report if argument -report is true
if args.report:
chain.report(args.stencil_file)
Expand Down
24 changes: 24 additions & 0 deletions test/stencils/jacobi3d_512x512x512.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{
"inputs": {
"a": {
"data": "data/zeros_32x32x32_fp32.dat",
"data_type": "float32"
}
},
"outputs": ["b"],
"dimensions": [512, 512, 512],
"program": {
"b": {
"computation_string":
"b = 0.16666666 * (a[i-1,j,k] + a[i+1,j,k] + a[i,j-1,k] + a[i,j+1,k] + a[i,j,k-1] + a[i,j,k+1])",
"boundary_conditions": {
"a": {
"type": "constant",
"value": 1.0
}
},
"data_type":
"float32"
}
}
}
16 changes: 16 additions & 0 deletions vars.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# shellcheck shell=sh
#
# Environment setup for the Intel FPGA OpenCL SDK and the Xilinx Vitis/XRT
# toolchains. The file only exports variables, so it must be sourced into the
# current shell (e.g. `. ./vars.sh`); executing it has no lasting effect.
#
# All installation paths and the license server are hard-coded.
# NOTE(review): they presumably describe one specific build host -- adjust
# them before using this file on another machine.

# intel fpga
export INTELFPGAOCLSDKROOT=/opt/intelFPGA_pro/19.1/hld
export PATH="$INTELFPGAOCLSDKROOT/bin/:$PATH"
export AOCL_BOARD_PACKAGE_ROOT="$INTELFPGAOCLSDKROOT/board/bittware_pcie/s10"
# NOTE(review): the two paths below point into a 19.4 installation while the
# SDK root above is 19.1; they look like reference locations of board_env.xml
# variants -- confirm which one is intended.
# /opt/intelFPGA_pro/19.4/hld/board/bittware_pcie/s10/board_env.xml
# /opt/intelFPGA_pro/19.4/hld/board/bittware_pcie/s10_hpc_default/board_env.xml
# Append the board package libraries. The ':' separator is emitted only when
# LD_LIBRARY_PATH already has content: an empty path element would make the
# dynamic loader search the current working directory.
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+$LD_LIBRARY_PATH:}$AOCL_BOARD_PACKAGE_ROOT/linux64/lib"

# xilinx fpga
export PATH="/opt/Xilinx/Vitis/2019.2/bin:/opt/Xilinx/Vitis_HLS/2019.2/bin:/opt/Xilinx/Vivado/2019.2/bin:$PATH"
export XILINX_XRT=/opt/xilinx/xrt
export PATH="$XILINX_XRT/bin:$PATH"
# Prepend the XRT libraries, again without creating an empty path element.
export LD_LIBRARY_PATH="$XILINX_XRT/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export XILINXD_LICENSE_FILE=2100@sgv-license-01
# Overwrites (does not extend) any LIBRARY_PATH inherited from the caller.
export LIBRARY_PATH=/usr/lib/x86_64-linux-gnu