spcl · andreaskuster · Jan 8, 2021 · Jan 8, 2021 · Jan 8, 2021 · Jan 9, 2021
diff --git a/README.md b/README.md
@@ -13,6 +13,7 @@ To run the code, the following software must be available:
 - Python 3.6.x or newer.
 - The `virtualenv` module (installed with `pip install virtualenv`).
 - A C++17-capable compiler (e.g., GCC 7.x or Clang 6.x).
+- Graphviz (for graph plotting support).
 - One or both FPGA compilers:
   - Intel FPGA OpenCL SDK (tested with 18.1.1 and 19.1)
   - Xilinx Vitis (tested with 2020.2) 
@@ -47,6 +48,13 @@ kernel source files themselves in:
 .dacecache/<kernel name>/src/intel_fpga/device
 ```
 
+To run a low-level analysis of the buffer sizes and to visualize the stencil program, invoke the executable `stencilflow/kernel_chain_graph.py`.
+Example usage:
+
+```bash
+stencilflow/kernel_chain_graph.py -stencil_file test/stencils/jacobi3d_32x32x32_8itr_8vec.json -plot -simulate -report -opt
+```
+
 Verification
 ------------
 
@@ -81,3 +89,16 @@ It is a known issue that launching multiple Intel FPGA kernels in quick
 succession (such as is done in the tests) can sometimes fail sporadically,
 seemingly due to file I/O issues. Running individual programs should never fail.
 
+Publication
+-----------
+
+If you use StencilFlow, cite us:
+```bibtex
+@inproceedings{stencilflow,
+  author    = {Johannes de Fine Licht and Andreas Kuster and Tiziano De Matteis and Tal Ben-Nun and Dominic Hofer and Torsten Hoefler},
+  title     = {StencilFlow: Mapping Large Stencil Programs to Distributed Spatial Computing Systems},
+  year      = {2021},
+  booktitle = {Proceedings of the IEEE/ACM International Symposium on Code Generation and Optimization (CGO)},
+  series = {CGO '21}
+}
+```
diff --git a/bug_min.json b/bug_min.json
@@ -0,0 +1,84 @@
+{
+ "inputs": {
+  "inA": {
+   "data": "inA_float32.dat",
+   "data_type": "float32",
+   "input_dims": [
+    "i"
+   ]
+  }
+ },
+ "outputs": [
+  "out"
+ ],
+ "dimensions": [
+  10,
+  10,
+  10
+ ],
+ "vectorization": 1,
+ "program": {
+  "k0": {
+   "data_type": "float32",
+   "computation_string": "k0 = inA[i]",
+   "boundary_conditions": {
+      "inA": {
+          "type": "constant",
+          "value": 0.0
+      }
+   }
+  },
+  "k1": {
+   "data_type": "float32",
+   "computation_string": "k1 = inA[i]",
+   "boundary_conditions": {
+      "inA": {
+          "type": "constant",
+          "value": 0.0
+      }
+   }
+  },
+  "k2": {
+   "data_type": "float32",
+   "computation_string": "k2 = k1[i, j, k] + k0[i+1, j, k] + k0[i, j, k]",
+   "boundary_conditions": {
+    "k1": {
+       "type": "constant",
+       "value": 0.0
+    },
+    "k0": {
+       "type": "constant",
+       "value": 0.0
+    }
+   }
+  },
+  "k3": {
+   "data_type": "float32",
+   "computation_string": "k3 = k0[i, j, k] + k1[i+1, j+1, k+1] + k1[i, j, k]",
+   "boundary_conditions": {
+    "k0": {
+       "type": "constant",
+       "value": 0.0
+    },
+    "k1": {
+       "type": "constant",
+       "value": 0.0
+    }
+   }
+  },
+  "out": {
+   "data_type": "float32",
+   "computation_string": "out = k2[i, j, k] + k3[i, j, k]",
+   "boundary_conditions": {
+    "k2":{
+       "type": "constant",
+       "value": 0.0
+    },
+    "k3": {
+       "type": "constant",
+       "value": 0.0
+    }
+    }
+   }
+ }
+}
diff --git a/bug_min_ext.json b/bug_min_ext.json
@@ -0,0 +1,94 @@
+{
+ "inputs": {
+  "inA": {
+   "data": "inA_float32.dat",
+   "data_type": "float32",
+   "input_dims": [
+    "i"
+   ]
+  }
+ },
+ "outputs": [
+  "out"
+ ],
+ "dimensions": [
+  8,
+  8,
+  8
+ ],
+ "vectorization": 1,
+ "program": {
+  "k0": {
+   "data_type": "float32",
+   "computation_string": "k0 = inA[i]",
+   "boundary_conditions": {
+      "inA": {
+          "type": "constant",
+          "value": 0.0
+      }
+   }
+  },
+  "k1": {
+   "data_type": "float32",
+   "computation_string": "k1 = inA[i]",
+   "boundary_conditions": {
+      "inA": {
+          "type": "constant",
+          "value": 0.0
+      }
+   }
+  },
+  "k2": {
+   "data_type": "float32",
+   "computation_string": "k2 = k1[i, j, k] + k0[i + 1, j, k] + k0[i, j, k]",
+   "boundary_conditions": {
+    "k1": {
+       "type": "constant",
+       "value": 0.0
+    },
+    "k0": {
+       "type": "constant",
+       "value": 0.0
+    }
+   }
+  },
+  "k3": {
+   "data_type": "float32",
+   "computation_string": "k3 = k0[i, j, k] + k4[i + 1, j, k] + k4[i, j, k]",
+   "boundary_conditions": {
+    "k0": {
+       "type": "constant",
+       "value": 0.0
+    },
+    "k4": {
+       "type": "constant",
+       "value": 0.0
+    }
+   }
+  },
+   "k4": {
+   "data_type": "float32",
+   "computation_string": "k4 = k1[i, j, k] + k1[i+1, j, k]",
+   "boundary_conditions": {
+    "k1": {
+       "type": "constant",
+       "value": 0.0
+    }
+   }
+  },
+  "out": {
+   "data_type": "float32",
+   "computation_string": "out = k2[i,j,k] + k3[i,j,k]",
+   "boundary_conditions": {
+    "k2":{
+       "type": "constant",
+       "value": 0.0
+    },
+    "k3": {
+       "type": "constant",
+       "value": 0.0
+    }
+    }
+   }
+ }
+}
diff --git a/stencilflow/kernel_chain_graph.py b/stencilflow/kernel_chain_graph.py
@@ -15,6 +15,8 @@
 import operator
 import re
 import os
+import sys
+sys.path.append(os.path.dirname(os.path.dirname(__file__)))
 
 from typing import Any, List, Dict, Tuple
 
@@ -85,6 +87,29 @@ def __init__(self,
         if self.log_level >= LogLevel.MODERATE:
             print("Compute delay buffer sizes.")
         self.compute_delay_buffer()  # compute the delay buffer sizes
+
+        for node in self.graph.nodes():
+            if node.name == "__tmp_T" or node.name == "__tmp_T_sqr_s_1351":
+                name = "u_tmp"
+                max_size = self.dimensions[0]*self.dimensions[1]
+                node.delay_buffer[name] = BoundedQueue(name=name, maxsize=max_size)
+                node.delay_buffer[name].import_data([None] * node.delay_buffer[name].maxsize)
+            if node.name == "__tmp_S" or node.name == "__tmp_S_sqr_uv_1352":
+                name = "v_tmp"
+                max_size = self.dimensions[0] * self.dimensions[1]
+                node.delay_buffer[name] = BoundedQueue(name=name, maxsize=max_size)
+                node.delay_buffer[name].import_data([None] * node.delay_buffer[name].maxsize)
+            if node.name == "__tmp_T_sqr_s_1351":
+                name = "ms_sdfg_1330___local_frac_1_dx_1660"
+                max_size = self.dimensions[0]*self.dimensions[1]
+                node.delay_buffer[name] = BoundedQueue(name=name, maxsize=max_size)
+                node.delay_buffer[name].import_data([None] * node.delay_buffer[name].maxsize)
+            if node.name == "__tmp_S_sqr_uv_1352":
+                name = "ms_sdfg_1330___local_frac_1_dx_1660"
+                max_size = self.dimensions[0] * self.dimensions[1]
+                node.delay_buffer[name] = BoundedQueue(name=name, maxsize=max_size)
+                node.delay_buffer[name].import_data([None] * node.delay_buffer[name].maxsize)
+
         if self.log_level >= LogLevel.MODERATE:
             print("Add channels to the graph edges.")
         # plot kernel graphs if flag set to true
@@ -507,6 +532,11 @@ def compute_delay_buffer(self) -> None:
                         dimensions=self.dimensions,
                         index=stencilflow.list_subtract_cwise(
                             max_delay[:-1], entry[:-1]))
+
+                    if not isinstance(node, Output):
+                        max_offset = node.dist_to_center[max(node.dist_to_center, key=lambda x: node.dist_to_center[x])]
+                        max_size = max_offset - node.dist_to_center[entry[-1]]
+
                     node.delay_buffer[name] = BoundedQueue(name=name,
                                                            maxsize=max_size)
                     node.delay_buffer[name].import_data(
@@ -789,6 +819,14 @@ def runtime_lower_bound(self):
                         type=int)
     parser.add_argument("-report", action="store_true")
     parser.add_argument("-simulate", action="store_true")
+    parser.add_argument("-opt", action="store_true")
+    parser.add_argument("-opt_goal", default=["min_fast_mem", 12000], nargs="+")
+    """
+        choices:
+        - min_com_vol, FAST_MEM_BOUND, SLOW_MEM_BOUND
+        - min_fast_mem, COM_VOL_BOUND
+        - opt_ratio, RATIO
+    """
     args = parser.parse_args()
     args.log_level = stencilflow.log_level.LogLevel(args.log_level)
     program_description = stencilflow.parse_json(args.stencil_file)
@@ -809,6 +847,17 @@ def runtime_lower_bound(self):
                         log_level=LogLevel(args.log_level))
         sim.simulate()
 
+    # Run the optimizer if requested. This is script scope, so the graph is `chain` (there is no `self` here).
+    if args.opt:  # NOTE(review): bounds typed on the CLI arrive as str (no type= on -opt_goal); only the default 12000 is int
+        from stencilflow import Optimizer
+        opt = Optimizer(chain.kernel_nodes, chain.dimensions)
+        if args.opt_goal[0] == "min_com_vol":  # usage: -opt_goal min_com_vol FAST_MEM_BOUND SLOW_MEM_BOUND
+            opt.minimize_comm_vol(fast_memory_bound=args.opt_goal[1], slow_memory_bound=args.opt_goal[2])
+        elif args.opt_goal[0] == "min_fast_mem":  # usage: -opt_goal min_fast_mem COM_VOL_BOUND
+            opt.minimize_fast_mem(communication_volume_bound=args.opt_goal[1])
+        elif args.opt_goal[0] == "opt_ratio":  # usage: -opt_goal opt_ratio RATIO
+            opt.optimize_to_ratio(ratio=args.opt_goal[1])
+
     # output a report if argument -report is true
     if args.report:
         chain.report(args.stencil_file)

diff --git a/test/stencils/jacobi3d_512x512x512.json b/test/stencils/jacobi3d_512x512x512.json
@@ -0,0 +1,24 @@
+{
+    "inputs": {
+        "a": {
+            "data": "data/zeros_32x32x32_fp32.dat",
+            "data_type": "float32"
+        }
+    },
+    "outputs": ["b"],
+    "dimensions": [512, 512, 512],
+    "program": {
+        "b": {
+            "computation_string":
+            "b = 0.16666666 * (a[i-1,j,k] + a[i+1,j,k] + a[i,j-1,k] + a[i,j+1,k] + a[i,j,k-1] + a[i,j,k+1])",
+            "boundary_conditions": {
+                "a": {
+                    "type": "constant",
+                    "value": 1.0
+                }
+            },
+            "data_type":
+            "float32"
+        }
+    }
+}
diff --git a/vars.sh b/vars.sh
@@ -0,0 +1,16 @@
+# intel fpga
+export INTELFPGAOCLSDKROOT=/opt/intelFPGA_pro/19.1/hld
+export PATH=$INTELFPGAOCLSDKROOT/bin/:$PATH
+export AOCL_BOARD_PACKAGE_ROOT=$INTELFPGAOCLSDKROOT/board/bittware_pcie/s10
+# /opt/intelFPGA_pro/19.4/hld/board/bittware_pcie/s10/board_env.xml
+# /opt/intelFPGA_pro/19.4/hld/board/bittware_pcie/s10_hpc_default/board_env.xml
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$AOCL_BOARD_PACKAGE_ROOT/linux64/lib
+
+# xilinx fpga
+export PATH=/opt/Xilinx/Vitis/2019.2/bin:/opt/Xilinx/Vitis_HLS/2019.2/bin:/opt/Xilinx/Vivado/2019.2/bin:$PATH
+export XILINX_XRT=/opt/xilinx/xrt
+export PATH=$XILINX_XRT/bin:$PATH
+export LD_LIBRARY_PATH=$XILINX_XRT/lib:$LD_LIBRARY_PATH
+export XILINXD_LICENSE_FILE=2100@sgv-license-01
+export LIBRARY_PATH=/usr/lib/x86_64-linux-gnu
+