@@ -133,9 +133,7 @@ T S2_hard_OpenMP_thread(uint128_t x,
133133
134134 BitSieve sieve (segment_size);
135135 Wheel wheel (primes, max_b + 1 , low);
136- phi.clear ();
137136 phi.resize (max_b + 1 , 0 );
138- mu_sum.clear ();
139137 mu_sum.resize (max_b + 1 , 0 );
140138 BinaryIndexedTree tree;
141139
@@ -341,58 +339,46 @@ S2_hard_OpenMP_master(X x,
341339 int64_t min_segment_size = loadBalancer.get_min_segment_size ();
342340 int64_t segment_size = min_segment_size;
343341 int64_t segments_per_thread = 1 ;
344- int64_t segments = 0 ;
345-
346- aligned_vector<vector<int128_t >> phi (threads);
347- aligned_vector<vector<res_t >> mu_sum (threads);
348- aligned_vector<double > timings (threads);
349342
350343 PiTable pi (max_prime);
351344 vector<int128_t > phi_total (pi[isqrt (z)] + 1 , 0 );
352345 double alpha = get_alpha (x, y);
353346
354- # pragma omp parallel num_threads(threads) reduction(+: s2_hard )
347+ while (low < limit )
355348 {
356- int t = omp_get_thread_num ();
349+ int64_t segments = ceil_div (limit - low, segment_size);
350+ threads = in_between (1 , threads, segments);
351+ segments_per_thread = in_between (1 , segments_per_thread, ceil_div (segments, threads));
357352
358- while (low < limit)
359- {
360- #pragma omp single
361- {
362- segments = ceil_div (limit - low, segment_size);
363- threads = in_between (1 , threads, segments);
364- segments_per_thread = in_between (1 , segments_per_thread, ceil_div (segments, threads));
365- }
353+ aligned_vector<vector<int128_t >> phi (threads);
354+ aligned_vector<vector<res_t >> mu_sum (threads);
355+ aligned_vector<double > timings (threads);
366356
367- if (t < threads)
368- {
369- timings[t] = get_wtime ();
370- s2_hard += S2_hard_OpenMP_thread (x, y, z, c, segment_size, segments_per_thread,
371- t, low, limit, alpha, factors, pi, primes, mu_sum[t], phi[t]);
372- timings[t] = get_wtime () - timings[t];
373- }
357+ #pragma omp parallel for num_threads(threads) reduction(+: s2_hard)
358+ for (int i = 0 ; i < threads; i++)
359+ {
360+ timings[i] = get_wtime ();
361+ s2_hard += S2_hard_OpenMP_thread (x, y, z, c, segment_size, segments_per_thread,
362+ i, low, limit, alpha, factors, pi, primes, mu_sum[i], phi[i]);
363+ timings[i] = get_wtime () - timings[i];
364+ }
374365
375- #pragma omp barrier
376- #pragma omp single
366+ // Once all threads have finished reconstruct and add the
367+ // missing contribution of all special leaves. This must
368+ // be done in order as each thread (i) requires the sum of
369+ // the phi values from the previous threads.
370+ //
371+ for (int i = 0 ; i < threads; i++)
372+ {
373+ for (size_t j = 1 ; j < phi[i].size (); j++)
377374 {
378- // Once all threads have finished reconstruct and add the
379- // missing contribution of all special leaves. This must
380- // be done in order as each thread requires the sum of the
381- // phi values from the previous threads.
382- //
383- for (int i = 0 ; i < threads; i++)
384- {
385- for (size_t j = 1 ; j < phi[i].size (); j++)
386- {
387- s2_hard += mu_sum[i][j] * phi_total[j];
388- phi_total[j] += phi[i][j];
389- }
390- }
391-
392- low += segments_per_thread * threads * segment_size;
393- loadBalancer.update (low, threads, &segment_size, &segments_per_thread, timings);
375+ s2_hard += mu_sum[i][j] * phi_total[j];
376+ phi_total[j] += phi[i][j];
394377 }
395378 }
379+
380+ low += segments_per_thread * threads * segment_size;
381+ loadBalancer.update (low, threads, &segment_size, &segments_per_thread, timings);
396382 }
397383
398384 return s2_hard;
0 commit comments