@@ -164,8 +164,7 @@ void vampire(vamp_t min, vamp_t max, struct vargs *args, fang_t fmax)
164164 fang_t max_sqrt = sqrtv_floor (max );
165165
166166#if CACHE
167- length_t length_a = partition3 (length (max ));
168- fang_t power_a = pow_v (length_a );
167+ fang_t power_a = pow_v (partition3 (length (max )));
169168 digits_t * dig = args -> digptr -> dig ;
170169#if USE_PDEP
171170 const uint64_t pdep_mask = get_pdep_mask ();
@@ -196,6 +195,49 @@ void vampire(vamp_t min, vamp_t max, struct vargs *args, fang_t fmax)
196195 product *= multiplicand ; // avoid overflow
197196
198197#if CACHE
198+ /*
199+ * We could just allocate the entire dig[] array, and then do:
200+ *
201+ * for (; multiplicand <= multiplicand_max; multiplicand += BASE - 1) {
202+ * if (dig[multiplier] + dig[multiplicand] == dig[product]) {
203+ * ...
204+ * }
205+ * product += product_iterator;
206+ * multiplicand += BASE-1;
207+ * }
208+ *
209+ * This would work just fine.
210+ * The only problem is that the array would be way too
211+ * big to fit in most l3 caches and we would waste a
212+ * majority of time loading data from memory.
213+ *
214+ * If we 'partition' the numbers (123 -> 12, 3), we can
215+ * make the array much smaller.
216+ *
217+ * Of course, 'partitioning' requires some computation,
218+ * but we are already waiting for memory load
219+ * operations, and we might as well put the wasted
220+ * cycles to good use.
221+ *
222+ * I chose to 'partition' like this:
223+ * product: de0, de1, de2
224+ * product iterator: step0, step1, step2
225+ *
226+ * multiplicand: e0, e1
227+ * Because it performs the best on all of my cpus.
228+ */
229+
230+ /*
231+ * We can improve the runtime even further by removing step2.
232+ * If step2 is always 0, we don't need it.
233+ *
234+ * step2 = (product_iterator / power_a) / power_a
235+ *
236+ * step2 has 0 digits, product_iterator has n+1, and we are going to solve for power_a:
237+ *
238+ * 0 >= (n+1 - x) - x
239+ * x >= n+1 - x
240+ */
199241
200242 fang_t step0 = product_iterator % power_a ;
201243 fang_t step1 = product_iterator / power_a ;
0 commit comments