Apply P0075R2

jaredhoberock · jaredhoberock · commit 832ce4b3c23c · 2017-11-17T15:11:53.000-06:00
diff --git a/algorithms.html b/algorithms.html
@@ -548,13 +548,373 @@ <h1>Header <code>&lt;experimental/algorithm&gt;</code> synopsis</h1>
   template&lt;class T&gt;
     ordered_update_t&lt;T&gt; ordered_update(T&amp; ref) noexcept;
 }
+<ins>
+
+// Exposition only: Suppress template argument deduction.
+template&lt;class T&gt; struct no_deduce { using type = T; };
+template&lt;class T&gt; using no_deduce_t = typename no_deduce&lt;T&gt;::type;
+
+<cxx-ref insynopsis="" to="parallel.alg.reductions"></cxx-ref> Support for reductions
+template&lt;class T, class BinaryOperation&gt;
+  <em>unspecified</em> reduction(T&amp; var, const T&amp; identity, BinaryOperation combiner);
+template&lt;class T&gt;
+  <em>unspecified</em> reduction_plus(T&amp; var);
+template&lt;class T&gt;
+  <em>unspecified</em> reduction_multiplies(T&amp; var);
+template&lt;class T&gt;
+  <em>unspecified</em> reduction_bit_and(T&amp; var);
+template&lt;class T&gt;
+  <em>unspecified</em> reduction_bit_or(T&amp; var);
+template&lt;class T&gt;
+  <em>unspecified</em> reduction_bit_xor(T&amp; var);
+template&lt;class T&gt;
+  <em>unspecified</em> reduction_min(T&amp; var);
+template&lt;class T&gt;
+  <em>unspecified</em> reduction_max(T&amp; var);
+
+<cxx-ref insynopsis="" to="parallel.alg.inductions"></cxx-ref> Support for inductions
+template&lt;class T&gt;
+  <em>unspecified</em> induction(T&amp;&amp; var);
+template&lt;class T, class S&gt;
+  <em>unspecified</em> induction(T&amp;&amp; var, S stride);
+
+<cxx-ref insynopsis="" to="parallel.alg.forloop"></cxx-ref> for_loop
+template&lt;class I, class... Rest&gt;
+  void for_loop(no_deduce_t&lt;I&gt; start, I finish, Rest&amp;&amp;... rest);
+template&lt;class ExecutionPolicy,
+         class I, class... Rest&gt;
+  void for_loop(ExecutionPolicy&amp;&amp; exec,
+                no_deduce_t&lt;I&gt; start, I finish, Rest&amp;&amp;... rest);
+template&lt;class I, class S, class... Rest&gt;
+  void for_loop_strided(no_deduce_t&lt;I&gt; start, I finish,
+                        S stride, Rest&amp;&amp;... rest);
+template&lt;class ExecutionPolicy,
+         class I, class S, class... Rest&gt;
+  void for_loop_strided(ExecutionPolicy&amp;&amp; exec,
+                        no_deduce_t&lt;I&gt; start, I finish,
+                        S stride, Rest&amp;&amp;... rest);
+template&lt;class I, class Size, class... Rest&gt;
+  void for_loop_n(I start, Size n, Rest&amp;&amp;... rest);
+template&lt;class ExecutionPolicy,
+         class I, class Size, class... Rest&gt;
+  void for_loop_n(ExecutionPolicy&amp;&amp; exec,
+                  I start, Size n, Rest&amp;&amp;... rest);
+template&lt;class I, class Size, class S, class... Rest&gt;
+  void for_loop_n_strided(I start, Size n, S stride, Rest&amp;&amp;... rest);
+template&lt;class ExecutionPolicy,
+         class I, class Size, class S, class... Rest&gt;
+  void for_loop_n_strided(ExecutionPolicy&amp;&amp; exec,
+                          I start, Size n, S stride, Rest&amp;&amp;... rest);
+</ins>
 <del>}</del>
 }
 }
 <del>}</del>
 </pre>
     </cxx-section>
 
+    <cxx-section id="parallel.alg.reductions">
+      <h1><ins>Reductions</ins></h1>
+
+      <ins>
+      <p>
+        Each of the function templates in this subclause ([parallel.alg.reductions]) returns a <em>reduction object</em>
+        of unspecified type having a <em>reduction value type</em> and encapsulating a <em>reduction identity</em> value for the reduction, a
+        <em>combiner</em> function object, and a <em>live-out object</em> from which the initial value is obtained and into which the final
+        value is stored.
+      </p>
+
+      <p>
+        An algorithm uses reduction objects by allocating an unspecified number of instances, known as <em>accumulators</em>, of the reduction value
+        type. <cxx-note>An implementation might, for example, allocate an accumulator for each thread in its private thread pool.</cxx-note>
+        Each accumulator is initialized with the object’s reduction identity, except that the live-out object (which was initialized by the
+        caller) comprises one of the accumulators. The algorithm passes a reference to an accumulator to each application of an element-access
+        function, ensuring that no two concurrently executing invocations share the same accumulator. An accumulator can be shared between two
+        applications that do not execute concurrently, but initialization is performed only once per accumulator.
+      </p>
+
+      <p>
+        Modifications to the accumulator by the application of element access functions accrue as partial results. At some point before the algorithm
+        returns, the partial results are combined, two at a time, using the reduction object’s combiner operation until a single value remains, which
+        is then assigned back to the live-out object. <cxx-note>In order to produce useful results, modifications to the accumulator should be limited
+        to commutative operations closely related to the combiner operation. For example, if the combiner is <code>plus&lt;T&gt;</code>, incrementing
+        the accumulator would be consistent with the combiner but doubling it or assigning to it would not.</cxx-note>
+      </p>
+      </ins>
+
+      <cxx-function>
+        <cxx-signature><ins>template&lt;class T, class BinaryOperation&gt;
+<em>unspecified</em> reduction(T&amp; var, const T&amp; identity, BinaryOperation combiner);</ins></cxx-signature>
+
+        <ins>
+        <cxx-requires><ins>T shall meet the requirements of <code>CopyConstructible</code> and <code>MoveAssignable</code>. The expression <code>var = combiner(var, var)</code> shall be well-formed.</ins></cxx-requires>
+        </ins>
+
+        <ins>
+        <cxx-returns><ins>a reduction object of unspecified type having reduction value type <code>T</code>, reduction identity <code>identity</code>, combiner function object <code>combiner</code>, and using the object referenced by <code>var</code> as its live-out object.</ins></cxx-returns>
+        </ins>
+      </cxx-function>
+
+      <cxx-function>
+        <cxx-signature><ins>template&lt;class T&gt;
+<em>unspecified</em> reduction_plus(T&amp; var);</ins></cxx-signature>
+        <cxx-signature><ins>template&lt;class T&gt;
+<em>unspecified</em> reduction_multiplies(T&amp; var);</ins></cxx-signature>
+        <cxx-signature><ins>template&lt;class T&gt;
+<em>unspecified</em> reduction_bit_and(T&amp; var);</ins></cxx-signature>
+        <cxx-signature><ins>template&lt;class T&gt;
+<em>unspecified</em> reduction_bit_or(T&amp; var);</ins></cxx-signature>
+        <cxx-signature><ins>template&lt;class T&gt;
+<em>unspecified</em> reduction_bit_xor(T&amp; var);</ins></cxx-signature>
+        <cxx-signature><ins>template&lt;class T&gt;
+<em>unspecified</em> reduction_min(T&amp; var);</ins></cxx-signature>
+        <cxx-signature><ins>template&lt;class T&gt;
+<em>unspecified</em> reduction_max(T&amp; var);</ins></cxx-signature>
+
+        <ins>
+          <cxx-requires><ins>T shall meet the requirements of <code>CopyConstructible</code> and <code>MoveAssignable</code>.</ins></cxx-requires>
+        </ins>
+
+        <ins>
+          <cxx-returns><ins>a reduction object of unspecified type having reduction value type <code>T</code>, reduction identity and combiner operation as specified in table <cxx-ref to="reduction-identities-and-combiner-operations"></cxx-ref> and using the object referenced by <code>var</code> as its live-out object.</ins></cxx-returns>
+        </ins>
+
+        <table is="cxx-table" class="column-rules" id=reduction-identities-and-combiner-operations>
+          <caption><ins>Reduction identities and combiner operations</ins></caption>
+          <thead>
+            <tr>
+              <th><ins>Function</ins></th>
+              <th><ins>Reduction Identity</ins></th>
+              <th><ins>Combiner Operation</ins></th>
+            </tr>
+            <tr>
+              <th><ins><code>reduction_plus</code></ins></th>
+              <th><ins><code>T()</code></ins></th>
+              <th><ins><code>x + y</code></ins></th>
+            </tr>
+            <tr>
+              <th><ins><code>reduction_multiplies</code></ins></th>
+              <th><ins><code>T(1)</code></ins></th>
+              <th><ins><code>x * y</code></ins></th>
+            </tr>
+            <tr>
+              <th><ins><code>reduction_bit_and</code></ins></th>
+              <th><ins><code>(~T())</code></ins></th>
+              <th><ins><code>x &amp; y</code></ins></th>
+            </tr>
+            <tr>
+              <th><ins><code>reduction_bit_or</code></ins></th>
+              <th><ins><code>T()</code></ins></th>
+              <th><ins><code>x | y</code></ins></th>
+            </tr>
+            <tr>
+              <th><ins><code>reduction_bit_xor</code></ins></th>
+              <th><ins><code>T()</code></ins></th>
+              <th><ins><code>x ^ y</code></ins></th>
+            </tr>
+            <tr>
+              <th><ins><code>reduction_min</code></ins></th>
+              <th><ins><code>var</code></ins></th>
+              <th><ins><code>min(x, y)</code></ins></th>
+            </tr>
+            <tr>
+              <th><ins><code>reduction_max</code></ins></th>
+              <th><ins><code>var</code></ins></th>
+              <th><ins><code>max(x, y)</code></ins></th>
+            </tr>
+          </thead>
+        </table>
+
+        <ins>
+        <cxx-example><ins>The following code updates each element of <code>y</code> and sets <code>s</code> to the sum of the squares.
+<pre>
+extern int n;
+extern float x[], y[], a;
+float s = 0;
+for_loop(execution::vec, 0, n,
+    reduction(s, 0.0f, plus&lt;&gt;()),
+    [&amp;](int i, float&amp; accum) {
+            y[i] += a*x[i];
+            accum += y[i]*y[i];
+    }
+);
+</pre>
+        </ins></cxx-example>
+        </ins>
+      </cxx-function>
+    </cxx-section>
+
+    <cxx-section id="parallel.alg.inductions">
+      <h1><ins>Inductions</ins></h1>
+
+      <ins>
+      <p>
+        Each of the function templates in this section returns an <em>induction object</em> of unspecified type having an <em>induction
+        value type</em> and encapsulating an initial value <em>i</em> of that type and, optionally, a <em>stride</em>.
+      </p>
+
+      <p>
+        For each element in the input range, an algorithm over input sequence <em>S</em> computes an <em>induction value</em> from an induction variable
+        and ordinal position <em>p</em> within <em>S</em> by the formula <em>i + p * stride</em> if a stride was specified or <em>i + p</em> otherwise. This induction value is
+        passed to the element access function.
+      </p>
+
+      <p>
+        An induction object may refer to a <em>live-out</em> object to hold the final value of the induction sequence. When the algorithm using the induction
+        object completes, the live-out object is assigned the value <em>i + n * stride</em>, where <em>n</em> is the number of elements in the input range.
+      </p>
+      </ins>
+
+      <cxx-function>
+        <cxx-signature><ins>template&lt;class T&gt;
+<em>unspecified</em> induction(T&amp;&amp; var);</ins></cxx-signature>
+
+        <cxx-signature><ins>template&lt;class T, class S&gt;
+<em>unspecified</em> induction(T&amp;&amp; var, S stride);</ins></cxx-signature>
+
+        <ins>
+          <cxx-returns>
+            <ins>
+            an induction object with induction value type <code>remove_cv_t&lt;remove_reference_t&lt;T&gt;&gt;</code>,
+            initial value <code>var</code>, and (if specified) stride <code>stride</code>. If <code>T</code> is an lvalue reference
+            to non-<code>const</code> type, then the object referenced by <code>var</code> becomes the live-out object for the
+            induction object; otherwise there is no live-out object.
+            </ins>
+          </cxx-returns>
+        </ins>
+      </cxx-function>
+    </cxx-section>
+
+    <cxx-section id="parallel.alg.forloop">
+      <h1><ins>For loop</ins></h1>
+
+      <cxx-function>
+        <cxx-signature><ins>template&lt;class I, class... Rest&gt;
+void for_loop(no_deduce_t&lt;I&gt; start, I finish, Rest&amp;&amp;... rest);</ins></cxx-signature>
+
+        <cxx-signature><ins>template&lt;class ExecutionPolicy,
+      class I, class... Rest&gt;
+void for_loop(ExecutionPolicy&amp;&amp; exec,
+              no_deduce_t&lt;I&gt; start, I finish, Rest&amp;&amp;... rest);
+
+</ins></cxx-signature>
+
+        <cxx-signature><ins>template&lt;class I, class S, class... Rest&gt;
+void for_loop_strided(no_deduce_t&lt;I&gt; start, I finish,
+                      S stride, Rest&amp;&amp;... rest);</ins></cxx-signature>
+
+        <cxx-signature><ins>template&lt;class ExecutionPolicy,
+      class I, class S, class... Rest&gt;
+void for_loop_strided(ExecutionPolicy&amp;&amp; exec,
+                      no_deduce_t&lt;I&gt; start, I finish,
+                      S stride, Rest&amp;&amp;... rest);
+
+</ins></cxx-signature>
+
+        <cxx-signature><ins>template&lt;class I, class Size, class... Rest&gt;
+void for_loop_n(I start, Size n, Rest&amp;&amp;... rest);</ins></cxx-signature>
+
+        <cxx-signature><ins>template&lt;class ExecutionPolicy,
+      class I, class Size, class... Rest&gt;
+void for_loop_n(ExecutionPolicy&amp;&amp; exec,
+                I start, Size n, Rest&amp;&amp;... rest);
+          
+</ins></cxx-signature>
+
+        <cxx-signature><ins>template&lt;class I, class Size, class S, class... Rest&gt;
+void for_loop_n_strided(I start, Size n, S stride, Rest&amp;&amp;... rest);</ins></cxx-signature>
+
+        <cxx-signature><ins>template&lt;class ExecutionPolicy, 
+      class I, class Size, class S, class... Rest&gt;
+void for_loop_n_strided(ExecutionPolicy&amp;&amp; exec,
+                        I start, Size n, S stride, Rest&amp;&amp;... rest);</ins></cxx-signature>
+
+        <ins>
+        <cxx-requires>
+          <ins>
+          For the overloads with an <code>ExecutionPolicy</code>, <code>I</code> shall be an integral type
+          or meet the requirements of a forward iterator type; otherwise, <code>I</code> shall be an integral
+          type or meet the requirements of an input iterator type. <code>Size</code> shall be an integral type
+          and <code>n</code> shall be non-negative. <code>S</code> shall have integral type and <code>stride</code>
+          shall have non-zero value. <code>stride</code> shall be negative only if <code>I</code> has integral
+          type or meets the requirements of a bidirectional iterator. The <code>rest</code> parameter pack shall
+          have at least one element, comprising objects returned by invocations of <code>reduction</code>
+          ([parallel.alg.reductions]) and/or <code>induction</code> ([parallel.alg.inductions]) function templates
+          followed by exactly one invocable element-access function, <em>f</em>. For the overloads with an
+          <code>ExecutionPolicy</code>, <em>f</em> shall meet the requirements of <code>CopyConstructible</code>;
+          otherwise, <em>f</em> shall meet the requirements of <code>MoveConstructible</code>.
+          </ins>
+        </cxx-requires>
+        </ins>
+
+        <ins>
+        <cxx-effects>
+          <ins>
+            Applies <em>f</em> to each element in the <em>input sequence</em>, as described below, with additional
+            arguments corresponding to the reductions and inductions in the <code>rest</code> parameter pack. The
+            length of the input sequence is:
+
+            <ul>
+              <li>
+                <code>n</code>, if specified,
+              </li>
+
+              <li>
+                otherwise <code>finish - start</code> if neither <code>n</code> nor <code>stride</code> is specified,
+              </li>
+
+              <li>
+                otherwise <code>1 + (finish-start-1)/stride</code> if <code>stride</code> is positive,
+              </li>
+
+              <li>
+                otherwise <code>1 + (start-finish-1)/-stride</code>.
+              </li>
+            </ul>
+
+            The first element in the input sequence is <code>start</code>. Each subsequent element is generated by adding
+            <code>stride</code> to the previous element, if <code>stride</code> is specified, otherwise by incrementing
+            the previous element. <cxx-note>As described in the C++ standard, section [algorithms.general], arithmetic
+            on non-random-access iterators is performed using advance and distance.</cxx-note> <cxx-note>The order of the
+            elements of the input sequence is important for determining ordinal position of an application of <em>f</em>,
+            even though the applications themselves may be unordered.</cxx-note>
+
+            The first argument to <em>f</em> is an element from the input sequence. <cxx-note>If <code>I</code> is an
+            iterator type, the iterators in the input sequence are not dereferenced before
+            being passed to <em>f</em>.</cxx-note> For each member of the rest parameter pack
+            excluding <em>f</em>, an additional argument is passed to each application of <em>f</em> as follows:
+
+            <ul>
+              <li>
+                If the pack member is an object returned by a call to a reduction function listed in section
+                [parallel.alg.reductions], then the additional argument is a reference to an accumulator of that reduction
+                object.
+              </li>
+
+              <li>
+                If the pack member is an object returned by a call to <code>induction</code>, then the additional argument is the
+                induction value for that induction object corresponding to the position of the application of <em>f</em> in the input
+                sequence.
+              </li>
+            </ul>
+          </ins>
+        </cxx-effects>
+        </ins>
+
+        <ins>
+        <cxx-complexity>
+          <ins>Applies <em>f</em> exactly once for each element of the input sequence.</ins>
+        </cxx-complexity>
+        </ins>
+
+        <ins>
+          <cxx-remarks>
+            <ins>If <em>f</em> returns a result, the result is ignored.</ins>
+          </cxx-remarks>
+        </ins>
+      </cxx-function>
+    </cxx-section>
+
     <cxx-section id="parallel.alg.foreach">
       <h1><del>For each</del></h1>
 
diff --git a/general.html b/general.html
@@ -80,6 +80,14 @@ <h1>Feature-testing recommendations</h1>
             <code>&lt;experimental/execution&gt;</code><br>
           </td>
         </tr>
+        <tr>
+          <td><ins>P0075R2</ins></td>
+          <td><ins>Template Library for Parallel For Loops</ins></td>
+          <td><cxx-ref to="parallel.alg.reductions"></cxx-ref>, <cxx-ref to="parallel.alg.inductions"></cxx-ref>, <cxx-ref to="parallel.alg.forloop"></cxx-ref></td>
+          <td><code><ins>__cpp_lib_experimental_parallel_for_loop</ins></code></td>
+          <td><ins>201711</ins></td>
+          <td><code><ins>&lt;experimental/algorithm&gt;</ins></code></td>
+        </tr>
       </thead>
     </table>
   </cxx-section>