@@ -118,7 +118,8 @@ static void printUsageInfo() {
118118 cout << endl;
119119 printFlag (" s=\" <command>(<params>)\" " ,
120120 " Specify a scheduling command to apply to the generated code. "
121- " Parameters take the form of a comma-delimited list. "
121+ " Parameters take the form of a comma-delimited list. See "
122+ " -help=scheduling for a list of scheduling commands. "
122123 " Examples: split(i,i0,i1,16), precompute(A(i,j)*x(j),i,i)." );
123124 cout << endl;
124125 printFlag (" c" ,
@@ -193,6 +194,74 @@ static void printUsageInfo() {
193194 printFlag (" nthreads" , " Specify number of threads for parallel execution" );
194195 cout << endl;
195196 printFlag (" prefix" , " Specify a prefix for generated function names" );
197+ cout << endl;
198+ printFlag (" help" , " Print this usage information." );
199+ cout << endl;
200+ printFlag (" help=scheduling" ,
201+ " Print information on the scheduling directives that can be passed "
202+ " to '-s'." );
203+ }
204+
205+ static void printSchedulingHelp () {
206+ cout << " Scheduling commands modify the execution of the index expression." << endl;
207+ cout << " The '-s' parameter specifies one or more scheduling commands." << endl;
208+ cout << " Schedules are additive; more commands can be passed by separating" << endl;
209+ cout << " them with commas, or passing multiple '-s' parameters." << endl;
210+ cout << endl;
211+ cout << " Examples:" << endl;
212+ cout << " -s=\" precompute(A(i,j)*x(j),i,i)\" " << endl;
213+ cout << " -s=\" split(i,i0,i1,32),parallelize(i0,CPUThread,NoRaces)\" " << endl;
214+ cout << endl;
215+ cout << " See http://tensor-compiler.org/docs/scheduling/index.html for more examples." << endl;
216+ cout << endl;
217+ cout << " Commands:" << endl;
218+ printFlag (" s=pos(i, ipos, tensor)" , " Takes in an index variable `i` "
219+ " that iterates over the coordinate space of `tensor` and replaces "
220+ " it with a derived index variable `ipos` that iterates over the "
222+ " same iteration range, but with respect to the position space. "
222+ " The `pos` transformation is not valid for dense level formats." );
223+ cout << endl;
224+ printFlag (" s=fuse(i, j, f)" , " Takes in two index variables `i` and `j`, where "
225+ " `j` is directly nested under `i`, and collapses them into a fused "
226+ " index variable `f` that iterates over the product of the "
227+ " coordinates `i` and `j`." );
228+ cout << endl;
229+ printFlag (" s=split(i, i0, i1, factor)" , " Splits (strip-mines) an index "
230+ " variable `i` into two nested index variables `i0` and `i1`. The "
231+ " size of the inner index variable `i1` is then held constant at "
232+ " `factor`, which must be a positive integer." );
233+ cout << endl;
234+ printFlag (" s=precompute(expr, i, iw)" , " Leverages scratchpad memories and "
235+ " reorders computations to increase locality. Given a subexpression "
236+ " `expr` to precompute, an index variable `i` to precompute over, "
237+ " and an index variable `iw` (which can be the same or different as "
238+ " `i`) to precompute with, the precomputed results are stored in a "
239+ " temporary tensor variable." );
240+ cout << endl;
241+ printFlag (" s=reorder(i1, i2, ...)" , " Takes in a new ordering for a "
242+ " set of index variables in the expression that are directly nested "
243+ " in the iteration order. The indexes are ordered from outermost "
244+ " to innermost." );
245+ cout << endl;
246+ printFlag (" s=bound(i, ib, b, type)" , " Replaces an index variable `i` "
247+ " with an index variable `ib` that obeys a compile-time constraint "
248+ " on its iteration space, incorporating knowledge about the size or "
249+ " structured sparsity pattern of the corresponding input. The "
250+ " meaning of `b` depends on the `type`. Possible bound types are: "
251+ " MinExact, MinConstraint, MaxExact, MaxConstraint." );
252+ cout << endl;
253+ printFlag (" s=unroll(i, factor)" , " Unrolls the loop corresponding to an "
254+ " index variable `i` by `factor` number of iterations, where "
255+ " `factor` is a positive integer." );
256+ cout << endl;
257+ printFlag (" s=parallelize(i, u, strat)" , " Tags an index variable `i` for "
258+ " parallel execution on hardware type `u`. Data races are handled by "
259+ " an output race strategy `strat`. Since the other transformations "
260+ " expect serial code, parallelize must come last in a series of "
261+ " transformations. Possible parallel hardware units are: "
262+ " NotParallel, GPUBlock, GPUWarp, GPUThread, CPUThread, CPUVector. "
263+ " Possible output race strategies are: "
264+ " IgnoreRaces, NoRaces, Atomics, Temporary, ParallelReduction." );
196265}
197266
198267static int reportError (string errorMessage, int errorCode) {
@@ -536,7 +605,15 @@ int main(int argc, char* argv[]) {
536605 if (argparts.size () == 2 )
537606 argValue = argparts[1 ];
538607
539- if (" -f" == argName) {
608+ if (" -help" == argName) {
609+ if (argValue == " scheduling" ) {
610+ printSchedulingHelp ();
611+ } else {
612+ printUsageInfo ();
613+ }
614+ return 0 ;
615+ }
616+ else if (" -f" == argName) {
540617 vector<string> descriptor = util::split (argValue, " :" );
541618 if (descriptor.size () < 2 || descriptor.size () > 4 ) {
542619 return reportError (" Incorrect format descriptor" , 4 );
0 commit comments