Skip to content

Commit cca201a

Browse files
authored
Merge pull request #319 from gwoltman/master
New fft_height and fft_width (backward compatible)
2 parents 4c35daf + 67ae312 commit cca201a

File tree

8 files changed

+460
-27
lines changed

8 files changed

+460
-27
lines changed

src/TrigBufCache.cpp

Lines changed: 160 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,9 @@
1414
#define SINGLE_WIDE 0 // Old single-wide tailSquare vs. new double-wide tailSquare
1515
#define SINGLE_KERNEL 0 // Implement tailSquare in a single kernel vs. two kernels
1616

17+
#define SAVE_ONE_MORE_WIDTH_MUL 0 // I want to make saving the only option -- but rocm optimizer is inexplicably making it slower in carryfused
18+
#define SAVE_ONE_MORE_HEIGHT_MUL 1 // In tailSquare this is the fastest option
19+
1720
#define _USE_MATH_DEFINES
1821
#include <cmath>
1922

@@ -93,23 +96,100 @@ double2 root1(u32 N, u32 k) {
9396
namespace {
9497
static const constexpr bool LOG_TRIG_ALLOC = false;
9598

99+
// Interleave two lines of trig values so that AMD GPUs can use global_load_dwordx4 instructions
100+
void T2shuffle(u32 size, u32 radix, u32 line, vector<double> &tab) {
101+
vector<double> line1, line2;
102+
u32 line_size = size / radix;
103+
for (u32 col = 0; col < line_size; ++col) {
104+
line1.push_back(tab[line*line_size + col]);
105+
line2.push_back(tab[(line+1)*line_size + col]);
106+
}
107+
for (u32 col = 0; col < line_size; ++col) {
108+
tab[line*line_size + 2*col] = line1[col];
109+
tab[line*line_size + 2*col + 1] = line2[col];
110+
}
111+
}
112+
96113
vector<double2> genSmallTrig(u32 size, u32 radix) {
97114
if (LOG_TRIG_ALLOC) { log("genSmallTrig(%u, %u)\n", size, radix); }
98115

99116
vector<double2> tab;
100-
#if 1
117+
// old fft_WIDTH
101118
for (u32 line = 1; line < radix; ++line) {
102119
for (u32 col = 0; col < size / radix; ++col) {
103120
tab.push_back(radix / line >= 8 ? root1Fancy(size, col * line) : root1(size, col * line));
104121
}
105122
}
106123
tab.resize(size);
107-
#else
108-
tab.resize(size);
109-
auto *p = tab.data() + radix;
110-
for (u32 w = radix; w < size; w *= radix) { p = smallTrigBlock(w, std::min(radix, size / w), p); }
111-
assert(p - tab.data() == size);
124+
125+
// New fft_WIDTH
126+
vector<double> tab1;
127+
// Epsilon value, 2^-250, should have an exact representation as a double
128+
const double epsilon = 5.5271478752604445602472651921923E-76; // Protect against divide by zero
129+
// Sine/cosine values for first fft8
130+
//TODO: explore using long doubles through the division (though dividing by double(cosine) may make sense)
131+
for (u32 line = 1; line < radix; ++line) {
132+
for (u32 col = 0; col < size / radix; ++col) {
133+
double2 root = root1(size, col * line); root.first += epsilon;
134+
tab1.push_back(root.second / root.first);
135+
}
136+
}
137+
// Interleave trig values for faster AMD GPU access
138+
for (u32 i = 0; i < 6; i += 2) T2shuffle(size, radix, i, tab1);
139+
// Sine/cosine values for second fft8
140+
for (u32 line = 0; line < radix; ++line) {
141+
for (u32 col = 0; col < size / radix / 8; ++col) {
142+
double2 root = root1(size, 8 * col * line); root.first += epsilon;
143+
tab1.push_back(root.second / root.first);
144+
}
145+
}
146+
// Cosine values for first fft8 (output in post-shuffle order)
147+
//TODO: Examine why when sine is 0.0 cosine is not 1.0 or -1.0 (printf is outputting 0.999... and -0.999...)
148+
for (u32 col = 0; col < size / radix; ++col) { // col 0..7 will be line0, col 8..15 will be line1, etc.
149+
for (u32 line = 0; line < radix; ++line) {
150+
double2 root = root1(size, col * line); root.first += epsilon;
151+
#if SAVE_ONE_MORE_WIDTH_MUL
152+
if (col / 8 == 3) { // Compute cosine3 / cosine1
153+
root.first /= root1(size, (col - 16) * line).first + epsilon;
154+
}
155+
if (col / 8 == 5) { // Compute cosine5 / cosine1
156+
root.first /= root1(size, (col - 32) * line).first + epsilon;
157+
}
112158
#endif
159+
if (col / 8 == 6) { // Compute cosine6 / cosine2
160+
root.first /= root1(size, (col - 32) * line).first + epsilon;
161+
}
162+
if (col / 8 == 7) { // Compute cosine7 / cosine3
163+
root.first /= root1(size, (col - 32) * line).first + epsilon;
164+
}
165+
tab1.push_back(root.first);
166+
}
167+
}
168+
// Interleave trig values for faster AMD GPU access
169+
for (u32 i = 8; i < 16; i += 2) T2shuffle(size, radix, i, tab1);
170+
// Cosine values for second fft8 (output in post-shuffle order)
171+
for (u32 col = 0; col < size / radix / 8; ++col) {
172+
for (u32 line = 0; line < radix; ++line) {
173+
double2 root = root1(size, 8 * col * line); root.first += epsilon;
174+
#if SAVE_ONE_MORE_WIDTH_MUL
175+
if (col == 3) { // Compute cosine3 / cosine1
176+
root.first /= root1(size, 8 * (col - 2) * line).first + epsilon;
177+
}
178+
if (col == 5) { // Compute cosine5 / cosine1
179+
root.first /= root1(size, 8 * (col - 4) * line).first + epsilon;
180+
}
181+
#endif
182+
if (col == 6) { // Compute cosine6 / cosine2
183+
root.first /= root1(size, 8 * (col - 4) * line).first + epsilon;
184+
}
185+
if (col == 7) { // Compute cosine7 / cosine3
186+
root.first /= root1(size, 8 * (col - 4) * line).first + epsilon;
187+
}
188+
tab1.push_back(root.first);
189+
}
190+
}
191+
// Convert to a vector of double2
192+
for (u32 i = 0; i < tab1.size(); i += 2) tab.push_back({tab1[i], tab1[i+1]});
113193

114194
return tab;
115195
}
@@ -119,11 +199,85 @@ vector<double2> genSmallTrigCombo(u32 width, u32 middle, u32 size, u32 radix) {
119199
if (LOG_TRIG_ALLOC) { log("genSmallTrigCombo(%u, %u)\n", size, radix); }
120200

121201
vector<double2> tab;
202+
// old fft_HEIGHT
122203
for (u32 line = 1; line < radix; ++line) {
123204
for (u32 col = 0; col < size / radix; ++col) {
124205
tab.push_back(radix / line >= 8 ? root1Fancy(size, col * line) : root1(size, col * line));
125206
}
126207
}
208+
tab.resize(size);
209+
210+
// New fft_HEIGHT
211+
vector<double> tab1;
212+
// Epsilon value, 2^-250, should have an exact representation as a double
213+
const double epsilon = 5.5271478752604445602472651921923E-76; // Protect against divide by zero
214+
// Sine/cosine values for first fft8
215+
//TODO: explore using long doubles through the division (though dividing by double(cosine) may make sense)
216+
for (u32 line = 1; line < radix; ++line) {
217+
for (u32 col = 0; col < size / radix; ++col) {
218+
double2 root = root1(size, col * line); root.first += epsilon;
219+
tab1.push_back(root.second / root.first);
220+
}
221+
}
222+
// Interleave trig values for faster AMD GPU access
223+
for (u32 i = 0; i < 6; i += 2) T2shuffle(size, radix, i, tab1);
224+
// Sine/cosine values for second fft8
225+
for (u32 line = 0; line < radix; ++line) {
226+
for (u32 col = 0; col < size / radix / 8; ++col) {
227+
double2 root = root1(size, 8 * col * line); root.first += epsilon;
228+
tab1.push_back(root.second / root.first);
229+
}
230+
}
231+
// Cosine values for first fft8 (output in post-shuffle order)
232+
//TODO: Examine why when sine is 0.0 cosine is not 1.0 or -1.0 (printf is outputting 0.999... and -0.999...)
233+
for (u32 col = 0; col < size / radix; ++col) { // col 0..7 will be line0, col 8..15 will be line1, etc.
234+
for (u32 line = 0; line < radix; ++line) {
235+
double2 root = root1(size, col * line); root.first += epsilon;
236+
#if SAVE_ONE_MORE_HEIGHT_MUL
237+
if (col / 8 == 3) { // Compute cosine3 / cosine1
238+
root.first /= root1(size, (col - 16) * line).first + epsilon;
239+
}
240+
if (col / 8 == 5) { // Compute cosine5 / cosine1
241+
root.first /= root1(size, (col - 32) * line).first + epsilon;
242+
}
243+
#endif
244+
if (col / 8 == 6) { // Compute cosine6 / cosine2
245+
root.first /= root1(size, (col - 32) * line).first + epsilon;
246+
}
247+
if (col / 8 == 7) { // Compute cosine7 / cosine3
248+
root.first /= root1(size, (col - 32) * line).first + epsilon;
249+
}
250+
tab1.push_back(root.first);
251+
}
252+
}
253+
// Interleave trig values for faster AMD GPU access
254+
for (u32 i = 8; i < 16; i += 2) T2shuffle(size, radix, i, tab1);
255+
// Cosine values for second fft8 (output in post-shuffle order)
256+
for (u32 col = 0; col < size / radix / 8; ++col) {
257+
for (u32 line = 0; line < radix; ++line) {
258+
double2 root = root1(size, 8 * col * line); root.first += epsilon;
259+
#if SAVE_ONE_MORE_HEIGHT_MUL
260+
if (col == 3) { // Compute cosine3 / cosine1
261+
root.first /= root1(size, 8 * (col - 2) * line).first + epsilon;
262+
}
263+
if (col == 5) { // Compute cosine5 / cosine1
264+
root.first /= root1(size, 8 * (col - 4) * line).first + epsilon;
265+
}
266+
#endif
267+
if (col == 6) { // Compute cosine6 / cosine2
268+
root.first /= root1(size, 8 * (col - 4) * line).first + epsilon;
269+
}
270+
if (col == 7) { // Compute cosine7 / cosine3
271+
root.first /= root1(size, 8 * (col - 4) * line).first + epsilon;
272+
}
273+
tab1.push_back(root.first);
274+
}
275+
}
276+
// Convert to a vector of double2
277+
for (u32 i = 0; i < tab1.size(); i += 2) tab.push_back({tab1[i], tab1[i+1]});
278+
279+
tab.resize(size*4);
280+
127281
// From tailSquare pre-calculate some or all of these: T2 trig = slowTrig_N(line + H * lowMe, ND / NH * 2);
128282
#if PREFER_DP_TO_MEM == 2 // No pre-computed trig values
129283
#elif PREFER_DP_TO_MEM == 1 // best option on a Radeon VII.

src/cl/base.cl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,8 +145,10 @@ typedef double2 T2;
145145
// For reasons unknown, loading trig values into nVidia's constant cache has terrible performance
146146
#if AMDGPU
147147
typedef constant const T2* Trig;
148+
typedef constant const T* TrigSingle;
148149
#else
149150
typedef global const T2* Trig;
151+
typedef global const T* TrigSingle;
150152
#endif
151153
// However, caching weights in nVidia's constant cache improves performance.
152154
// Even better is to not pollute the constant cache with weights that are used only once.

src/cl/carryfused.cl

Lines changed: 29 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,14 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P(
2929
u32 line = gr % H;
3030

3131
T2 u[NW];
32-
32+
3333
readCarryFusedLine(in, u, line);
3434

35-
// Split 32 bits into NW groups of 2 bits. See later for different way to do this.
35+
#if HAS_ASM
36+
__asm("s_setprio 3");
37+
#endif
38+
39+
// Split 32 bits into NW groups of 2 bits. See later for different way to do this.
3640
#if !BIGLIT
3741
#define GPW (16 / NW)
3842
u32 b = bits[(G_W * line + me) / GPW] >> (me % GPW * (2 * NW));
@@ -43,7 +47,18 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P(
4347
// common sub-expressions to re-use in the second fft_WIDTH call. Re-using this data requires dozens of VGPRs
4448
// which causes a terrible reduction in occupancy.
4549
// fft_WIDTH(lds + (get_group_id(0) / 131072), u, smallTrig + (get_group_id(0) / 131072));
46-
fft_WIDTH(lds, u, smallTrig);
50+
51+
// A temporary hack until we figure out which combinations we want to finally offer:
52+
// UNROLL_W=0: old fft_WIDTH, no loop unrolling
53+
// UNROLL_W=1: old fft_WIDTH, loop unrolling
54+
// UNROLL_W=2: old fft_WIDTH, loop unrolling with "hidden zero" hack to thwart rocm optimizer. I'm seeing this as best R7Pro option.
55+
// UNROLL_W=3: new fft_WIDTH if applicable, hidden zero hack. Slightly better on Radeon VII -- more study needed as to why results weren't better.
56+
// UNROLL_W=4: new fft_WIDTH if applicable, no hidden zero hack. Best on Titan V.
57+
#if UNROLL_W == 2 || UNROLL_W == 3
58+
new_fft_WIDTH1(lds + (get_group_id(0) / 131072), u, smallTrig + (get_group_id(0) / 131072));
59+
#else
60+
new_fft_WIDTH1(lds, u, smallTrig);
61+
#endif
4762

4863
Word2 wu[NW];
4964
#if AMDGPU
@@ -86,7 +101,7 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P(
86101
}
87102

88103
// On Titan V it is faster to derive the big vs. little flags from the fractional number of bits in each FFT word rather read the flags from memory.
89-
// On Radeon VII this code is about he same speed. Not sure which is better on other GPUs.
104+
// On Radeon VII this code is about the same speed. Not sure which is better on other GPUs.
90105
#if BIGLIT
91106
// Calculate the most significant 32-bits of FRAC_BPW * the index of the FFT word. Also add FRAC_BPW_HI to test first biglit flag.
92107
u32 fft_word_index = (me * H + line) * 2;
@@ -147,10 +162,16 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P(
147162
}
148163
}
149164
if (gr == 0) { return; }
165+
#if HAS_ASM
166+
__asm("s_setprio 0");
167+
#endif
150168
u32 pos = (gr - 1) * (G_W / WAVEFRONT) + me / WAVEFRONT;
151169
if (me % WAVEFRONT == 0) {
152-
do { spin(); } while(atomic_load((atomic_uint *) &ready[pos]) == 0);
170+
do { spin(); } while(atomic_load_explicit((atomic_uint *) &ready[pos], memory_order_relaxed, memory_scope_device) == 0);
153171
}
172+
#if HAS_ASM
173+
__asm("s_setprio 1");
174+
#endif
154175
mem_fence(CLK_GLOBAL_MEM_FENCE);
155176

156177
// Clear carry ready flag for next iteration
@@ -201,6 +222,8 @@ KERNEL(G_W) carryFused(P(T2) out, CP(T2) in, u32 posROE, P(i64) carryShuttle, P(
201222

202223
bar();
203224

204-
fft_WIDTH(lds, u, smallTrig);
225+
// fft_WIDTH(lds, u, smallTrig);
226+
new_fft_WIDTH2(lds, u, smallTrig);
227+
205228
writeCarryFusedLine(u, out, line);
206229
}

0 commit comments

Comments
 (0)