Skip to content

Commit 8b1a21b

Browse files
P-1 factoring stage 2: around 20% faster (pre-sieve gap + memory handling); uses more memory on the GPU to precompute more even powers (this avoids a forward transform on them most of the time)
1 parent ff7465f commit 8b1a21b

File tree

7 files changed

+504
-31
lines changed

7 files changed

+504
-31
lines changed

include/core/AlgoUtils.hpp

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -341,6 +341,117 @@ static inline unsigned long evenGapBound(const mpz_class& B2) {
341341
return bound < 2 ? 1 : static_cast<unsigned long>(bound);
342342
}
343343

344+
#ifdef __APPLE__
345+
# include <OpenCL/opencl.h>
346+
#else
347+
# include <CL/cl.h>
348+
#endif
349+
#include <iomanip>
350+
#include <sstream>
351+
#include <vector>
352+
#include <cmath>
353+
354+
/// Formats a byte count as a human-readable string using binary (1024-based)
/// steps and the unit labels B, KB, MB, GB, TB.
///
/// The value is divided by 1024 until it drops below 1024 or the TB unit is
/// reached, so anything beyond TB is still reported in TB. The number of
/// decimals depends on the scaled magnitude: 2 below 10, 1 below 100, else 0.
///
/// @param x  Size in bytes.
/// @return   Text such as "1.50 KB", "15.0 MB" or "512 B".
static inline std::string fmt_bytes_u64(unsigned long long x){
    static const char* const kUnits[] = {"B","KB","MB","GB","TB"};
    const int kLastUnit = 4;

    double scaled = static_cast<double>(x);
    int unit = 0;
    for (; unit < kLastUnit && scaled >= 1024.0; ++unit) {
        scaled /= 1024.0;
    }

    // Fewer decimals as the number gets wider, keeping about three significant digits.
    int decimals = 0;
    if (scaled < 10.0) {
        decimals = 2;
    } else if (scaled < 100.0) {
        decimals = 1;
    }

    std::ostringstream text;
    text << std::fixed << std::setprecision(decimals) << scaled << ' ' << kUnits[unit];
    return text.str();
}
361+
362+
struct GpuMemProbe {
363+
cl_ulong total_bytes = 0;
364+
std::string name;
365+
std::string vendor;
366+
int picked_index = -1;
367+
};
368+
369+
/// Lists every OpenCL GPU device on every platform to std::cout and returns
/// the properties of the device whose running index equals @p device_index.
///
/// Devices are numbered from 0 in enumeration order (platform by platform,
/// GPU-type devices only). The whole inventory is always printed, even after
/// the requested device has been found.
///
/// @param device_index  Zero-based index into the flattened GPU list.
/// @return A GpuMemProbe with picked_index >= 0 when the device exists.
///         If no platform is available, platform enumeration fails, or the
///         index is out of range, the returned probe is default-constructed
///         (total_bytes == 0, picked_index == -1). total_bytes is also 0 when
///         the device's memory size query fails.
static inline GpuMemProbe gpu_probe_mem_by_index_verbose(int device_index){
    GpuMemProbe out;

    cl_uint np = 0;
    cl_int st = clGetPlatformIDs(0, nullptr, &np);
    if (st != CL_SUCCESS || np == 0){
        std::cout << "[evenGapBound2] OpenCL: no platform found (code " << st << ")\n";
        return out;
    }

    std::vector<cl_platform_id> plats(np);
    // The second call can fail too; without this check the vector would hold
    // null handles that are then passed to clGetDeviceIDs.
    st = clGetPlatformIDs(np, plats.data(), nullptr);
    if (st != CL_SUCCESS){
        std::cout << "[evenGapBound2] OpenCL: failed to list platforms (code " << st << ")\n";
        return out;
    }

    int seen = 0;
    std::cout << "[evenGapBound2] OpenCL GPU inventory:\n";
    for (const cl_platform_id plat : plats){
        cl_uint nd = 0;
        if (clGetDeviceIDs(plat, CL_DEVICE_TYPE_GPU, 0, nullptr, &nd) != CL_SUCCESS || nd == 0) continue;
        std::vector<cl_device_id> devs(nd);
        if (clGetDeviceIDs(plat, CL_DEVICE_TYPE_GPU, nd, devs.data(), nullptr) != CL_SUCCESS) continue;

        for (const cl_device_id dev : devs){
            char name[256] = {0}, vendor[256] = {0};
            cl_ulong mem = 0;

            // A failed query leaves the output buffer unspecified, so reset it
            // instead of printing or storing whatever it contains.
            if (clGetDeviceInfo(dev, CL_DEVICE_NAME, sizeof(name), name, nullptr) != CL_SUCCESS) name[0] = '\0';
            if (clGetDeviceInfo(dev, CL_DEVICE_VENDOR, sizeof(vendor), vendor, nullptr) != CL_SUCCESS) vendor[0] = '\0';
            if (clGetDeviceInfo(dev, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(mem), &mem, nullptr) != CL_SUCCESS) mem = 0;
            name[sizeof(name) - 1] = '\0';
            vendor[sizeof(vendor) - 1] = '\0';

            std::cout << "  ["<<seen<<"] " << vendor << " | " << name
                      << " | " << fmt_bytes_u64((unsigned long long)mem) << "\n";
            if (seen == device_index){
                out.total_bytes = mem;
                out.name = name;
                out.vendor = vendor;
                out.picked_index = seen;
            }
            ++seen;
        }
    }

    if (out.picked_index < 0){
        std::cout << "[evenGapBound2] GPU index " << device_index << " not found, falling back.\n";
    } else {
        std::cout << "[evenGapBound2] Selected: ["<<out.picked_index<<"] " << out.vendor << " | " << out.name
                  << " | " << fmt_bytes_u64((unsigned long long)out.total_bytes) << "\n";
    }
    return out;
}
411+
412+
413+
static inline unsigned long evenGapBound2(const mpz_class& B2,
414+
int device_index,
415+
size_t transformSize,
416+
size_t baseRegsTotal,
417+
double fraction_use = 0.55)
418+
{
419+
auto probe = gpu_probe_mem_by_index_verbose(device_index);
420+
if (!probe.total_bytes){
421+
std::cout << "[evenGapBound2] Could not query GPU memory. Falling back to evenGapBound(B2).\n";
422+
return evenGapBound(B2);
423+
}
424+
425+
if (fraction_use < 0.10) fraction_use = 0.10;
426+
if (fraction_use > 0.80) fraction_use = 0.80;
427+
428+
const size_t bytes_per_coeff = 16; // per-register footprint per transform slot
429+
const size_t reg_bytes = transformSize * bytes_per_coeff;
430+
431+
size_t budget = (size_t)((double)probe.total_bytes * fraction_use);
432+
const size_t fixed_overhead = 256ull<<20; // 256 MB headroom
433+
if (budget > fixed_overhead) budget -= fixed_overhead; else budget = 0;
434+
435+
std::cout << "[evenGapBound2] Memory budget: "
436+
<< fmt_bytes_u64((unsigned long long)budget)
437+
<< " | reg_bytes=" << fmt_bytes_u64((unsigned long long)reg_bytes)
438+
<< " | baseRegs=" << baseRegsTotal << "\n";
439+
440+
if (budget <= baseRegsTotal * reg_bytes){
441+
std::cout << "[evenGapBound2] Not enough budget for extra REVEN registers. nbEven=1\n";
442+
return 1ul;
443+
}
444+
445+
const size_t avail = budget - baseRegsTotal * reg_bytes;
446+
unsigned long nbEven_by_mem = (unsigned long)(avail / reg_bytes);
447+
448+
if (nbEven_by_mem < 1ul) nbEven_by_mem = 1ul;
449+
450+
std::cout << "[evenGapBound2] Using maximum memory-based nbEven=" << nbEven_by_mem << "\n";
451+
return nbEven_by_mem;
452+
}
453+
454+
344455
static inline size_t primeCountApprox(const mpz_class& low, const mpz_class& high) {
345456
auto li = [](double x) {
346457
double l = std::log(x);

include/core/Version.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
#include <string>
55

66
namespace core {
7-
const std::string PRMERS_VERSION = "4.15.40-alpha";
7+
const std::string PRMERS_VERSION = "4.15.41-alpha";
88
} // namespace core
99

1010
#endif // VERSION_HPP

include/io/CliParser.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ struct CliOptions {
2929
bool verify = true;
3030
bool gerbiczli = true;
3131
uint64_t B1 = 10000;
32+
uint64_t memlim = 70;
3233
uint64_t B1old = 0;
3334
uint64_t B1_new = 0;
3435
uint64_t B2 = 0;

include/marin/engine.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ class engine
3636
virtual void square_mul(const Reg src, const uint32 a = 1) const = 0;
3737
// dst = multiplicand(src). A multiplicand is the src of the mul operation.
3838
virtual void set_multiplicand(const Reg dst, const Reg src) const = 0;
39+
virtual void set_multiplicand2(const Reg dst, const Reg src) const = 0;
3940
// dst = dst * src * a. src must be a multiplicand, created with set_multiplicand.
4041
virtual void mul(const Reg dst, const Reg src, const uint32 a = 1) const = 0;
4142
// src = src - a
@@ -47,6 +48,7 @@ class engine
4748
virtual void addsub(const Reg sum_out, const Reg diff_out, const Reg a, const Reg b) const = 0;
4849

4950
virtual void square_mul_copy(const Reg src, const Reg dst_copy, const uint32 a = 1) const = 0;
51+
virtual void mul_new(const Reg dst, const Reg src, const uint32 a = 1) const = 0;
5052

5153
virtual void mul_copy(const Reg dst, const Reg src, const Reg dst_copy, const uint32 a = 1) const = 0;
5254
virtual void addsub_copy(const Reg sum, const Reg diff, const Reg sum_copy, const Reg diff_copy,

include/marin/engine_gpu.h

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -896,6 +896,67 @@ class engine_gpu : public engine
896896
}
897897
}
898898

899+
// Copies register rsrc into rdst, then calls the _gpu->forward* routine(s)
// selected by the transform size _n on rdst, in place.
//
// Sizes handled: 2^4 .. 2^26 and 5*2^3 .. 5*2^24. Any other size reaches the
// default label and throws std::runtime_error. That includes 2^2 and 2^3,
// whose cases are commented out below (they also refer to `src`, which is not
// declared in this function).
//
// NOTE(review): presumably this prepares rdst as the `src` operand of
// mul_new(); confirm against the callers and against set_multiplicand().
void set_multiplicand2(const Reg rdst, const Reg rsrc) const override
900+
{
901+
copy(rdst, rsrc);
902+
903+
const size_t n = _n, dst = size_t(rdst);
904+
905+
switch (n)
906+
{
907+
//case 1u << 2: _gpu->mul4x1(dst, src); break;
908+
//case 1u << 3: _gpu->mul8(dst, src); break;
909+
case 1u << 4: _gpu->forward4_0(dst); break;
910+
case 1u << 5: _gpu->forward4_0(dst); break;
911+
case 1u << 6: _gpu->forward16_0(dst); break;
912+
case 1u << 7: _gpu->forward16_0(dst); break;
913+
case 1u << 8: _gpu->forward16_0(dst); break;
914+
case 1u << 9: _gpu->forward16_0(dst); break;
915+
case 1u << 10: _gpu->forward16_0(dst); break;
916+
case 1u << 11: _gpu->forward16_0(dst); break;
917+
case 1u << 12: _gpu->forward64_0(dst); break;
918+
case 1u << 13: _gpu->forward64_0(dst); break;
919+
case 1u << 14: _gpu->forward64_0(dst); break;
920+
case 1u << 15: _gpu->forward64_0(dst); break;
921+
case 1u << 16: _gpu->forward64_0(dst); break;
922+
case 1u << 17: _gpu->forward256_0(dst); break;
923+
case 1u << 18: _gpu->forward256_0(dst); break;
924+
case 1u << 19: _gpu->forward1024_0(dst); break;
925+
case 1u << 20: _gpu->forward1024_0(dst); break;
926+
case 1u << 21: _gpu->forward64_0(dst); _gpu->forward64(dst, 1024, 8); break;
927+
case 1u << 22: _gpu->forward64_0(dst); _gpu->forward64(dst, 1024, 9); break;
928+
case 1u << 23: _gpu->forward64_0(dst); _gpu->forward256(dst, 4096, 8); break;
929+
case 1u << 24: _gpu->forward64_0(dst); _gpu->forward256(dst, 4096, 9); break;
930+
case 1u << 25: _gpu->forward256_0(dst); _gpu->forward256(dst, 16384, 8); break;
931+
case 1u << 26: _gpu->forward256_0(dst); _gpu->forward256(dst, 16384, 9); break;
932+
933+
case 5u << 3: _gpu->forward5_0(dst); break;
934+
case 5u << 4: _gpu->forward20_0(dst); break;
935+
case 5u << 5: _gpu->forward20_0(dst); break;
936+
case 5u << 6: _gpu->forward20_0(dst); break;
937+
case 5u << 7: _gpu->forward20_0(dst); break;
938+
case 5u << 8: _gpu->forward20_0(dst); break;
939+
case 5u << 9: _gpu->forward20_0(dst); break;
940+
case 5u << 10: _gpu->forward80_0(dst); break;
941+
case 5u << 11: _gpu->forward80_0(dst); break;
942+
case 5u << 12: _gpu->forward80_0(dst); break;
943+
case 5u << 13: _gpu->forward80_0(dst); break;
944+
case 5u << 14: _gpu->forward320_0(dst); break;
945+
case 5u << 15: _gpu->forward320_0(dst); break;
946+
case 5u << 16: _gpu->forward320_0(dst); break;
947+
case 5u << 17: _gpu->forward80_0(dst); _gpu->forward64(dst, 1280, 6); break;
948+
case 5u << 18: _gpu->forward80_0(dst); _gpu->forward64(dst, 1280, 7); break;
949+
case 5u << 19: _gpu->forward80_0(dst); _gpu->forward256(dst, 5120, 6); break;
950+
case 5u << 20: _gpu->forward80_0(dst); _gpu->forward256(dst, 5120, 7); break;
951+
case 5u << 21: _gpu->forward80_0(dst); _gpu->forward256(dst, 5120, 8); break;
952+
case 5u << 22: _gpu->forward80_0(dst); _gpu->forward256(dst, 5120, 9); break;
953+
case 5u << 23: _gpu->forward320_0(dst); _gpu->forward256(dst, 20480, 8); break;
954+
case 5u << 24: _gpu->forward320_0(dst); _gpu->forward256(dst, 20480, 9); break;
955+
956+
// Unsupported transform size (including the two disabled cases above).
default: throw std::runtime_error("An unexpected error has occurred.");
957+
}
959+
899960
void mul(const Reg rdst, const Reg rsrc, const uint32 a = 1) const override
900961
{
901962
const size_t n = _n, dst = size_t(rdst), src = size_t(rsrc);
@@ -957,6 +1018,67 @@ class engine_gpu : public engine
9571018
_gpu->carry_weight_mul(dst, a);
9581019
}
9591020

1021+
// Dispatches on the transform size _n: calls a _gpu->mul* routine on
// (dst, src), then the listed _gpu->backward* routine(s) on dst, and finally
// _gpu->carry_weight_mul(dst, a) for every supported size.
//
// For sizes 2^2 and 2^3 only the mul* routine is called (no backward* call).
// Sizes handled: 2^2 .. 2^26 and 5*2^3 .. 5*2^24; any other size throws
// std::runtime_error.
//
// No forward* routine is called on dst or src in this function.
// NOTE(review): presumably both operands must already be in the form the mul*
// routines expect (src e.g. prepared by set_multiplicand2) — confirm against
// mul() and the callers.
void mul_new(const Reg rdst, const Reg rsrc, const uint32 a = 1) const override
1022+
{
1023+
const size_t n = _n, dst = size_t(rdst), src = size_t(rsrc);
1024+
1025+
switch (n)
1026+
{
1027+
case 1u << 2: _gpu->mul4x1(dst, src); break;
1028+
case 1u << 3: _gpu->mul8(dst, src); break;
1029+
case 1u << 4: _gpu->mul4(dst, src); _gpu->backward4_0(dst); break;
1030+
case 1u << 5: _gpu->mul8(dst, src); _gpu->backward4_0(dst); break;
1031+
case 1u << 6: _gpu->mul4(dst, src); _gpu->backward16_0(dst); break;
1032+
case 1u << 7: _gpu->mul8(dst, src); _gpu->backward16_0(dst); break;
1033+
case 1u << 8: _gpu->mul16(dst, src); _gpu->backward16_0(dst); break;
1034+
case 1u << 9: _gpu->mul32(dst, src); _gpu->backward16_0(dst); break;
1035+
case 1u << 10: _gpu->mul64(dst, src); _gpu->backward16_0(dst); break;
1036+
case 1u << 11: _gpu->mul128(dst, src); _gpu->backward16_0(dst); break;
1037+
case 1u << 12: _gpu->mul64(dst, src); _gpu->backward64_0(dst); break;
1038+
case 1u << 13: _gpu->mul128(dst, src); _gpu->backward64_0(dst); break;
1039+
case 1u << 14: _gpu->mul256(dst, src); _gpu->backward64_0(dst); break;
1040+
case 1u << 15: _gpu->mul512(dst, src); _gpu->backward64_0(dst); break;
1041+
case 1u << 16: _gpu->mul1024(dst, src); _gpu->backward64_0(dst); break;
1042+
case 1u << 17: _gpu->mul512(dst, src); _gpu->backward256_0(dst); break;
1043+
case 1u << 18: _gpu->mul1024(dst, src); _gpu->backward256_0(dst); break;
1044+
case 1u << 19: _gpu->mul512(dst, src); _gpu->backward1024_0(dst); break;
1045+
case 1u << 20: _gpu->mul1024(dst, src); _gpu->backward1024_0(dst); break;
1046+
case 1u << 21: _gpu->mul512(dst, src); _gpu->backward64(dst, 1024, 8); _gpu->backward64_0(dst); break;
1047+
case 1u << 22: _gpu->mul1024(dst, src); _gpu->backward64(dst, 1024, 9); _gpu->backward64_0(dst); break;
1048+
case 1u << 23: _gpu->mul512(dst, src); _gpu->backward256(dst, 4096, 8); _gpu->backward64_0(dst); break;
1049+
case 1u << 24: _gpu->mul1024(dst, src); _gpu->backward256(dst, 4096, 9); _gpu->backward64_0(dst); break;
1050+
case 1u << 25: _gpu->mul512(dst, src); _gpu->backward256(dst, 16384, 8); _gpu->backward256_0(dst); break;
1051+
case 1u << 26: _gpu->mul1024(dst, src); _gpu->backward256(dst, 16384, 9); _gpu->backward256_0(dst); break;
1052+
1053+
case 5u << 3: _gpu->mul8(dst, src); _gpu->backward5_0(dst); break;
1054+
case 5u << 4: _gpu->mul4(dst, src); _gpu->backward20_0(dst); break;
1055+
case 5u << 5: _gpu->mul8(dst, src); _gpu->backward20_0(dst); break;
1056+
case 5u << 6: _gpu->mul16(dst, src); _gpu->backward20_0(dst); break;
1057+
case 5u << 7: _gpu->mul32(dst, src); _gpu->backward20_0(dst); break;
1058+
case 5u << 8: _gpu->mul64(dst, src); _gpu->backward20_0(dst); break;
1059+
case 5u << 9: _gpu->mul128(dst, src); _gpu->backward20_0(dst); break;
1060+
case 5u << 10: _gpu->mul64(dst, src); _gpu->backward80_0(dst); break;
1061+
case 5u << 11: _gpu->mul128(dst, src); _gpu->backward80_0(dst); break;
1062+
case 5u << 12: _gpu->mul256(dst, src); _gpu->backward80_0(dst); break;
1063+
case 5u << 13: _gpu->mul512(dst, src); _gpu->backward80_0(dst); break;
1064+
case 5u << 14: _gpu->mul256(dst, src); _gpu->backward320_0(dst); break;
1065+
case 5u << 15: _gpu->mul512(dst, src); _gpu->backward320_0(dst); break;
1066+
case 5u << 16: _gpu->mul1024(dst, src); _gpu->backward320_0(dst); break;
1067+
case 5u << 17: _gpu->mul128(dst, src); _gpu->backward64(dst, 1280, 6); _gpu->backward80_0(dst); break;
1068+
case 5u << 18: _gpu->mul256(dst, src); _gpu->backward64(dst, 1280, 7); _gpu->backward80_0(dst); break;
1069+
case 5u << 19: _gpu->mul128(dst, src); _gpu->backward256(dst, 5120, 6); _gpu->backward80_0(dst); break;
1070+
case 5u << 20: _gpu->mul256(dst, src); _gpu->backward256(dst, 5120, 7); _gpu->backward80_0(dst); break;
1071+
case 5u << 21: _gpu->mul512(dst, src); _gpu->backward256(dst, 5120, 8); _gpu->backward80_0(dst); break;
1072+
case 5u << 22: _gpu->mul1024(dst, src); _gpu->backward256(dst, 5120, 9); _gpu->backward80_0(dst); break;
1073+
case 5u << 23: _gpu->mul512(dst, src); _gpu->backward256(dst, 20480, 8); _gpu->backward320_0(dst); break;
1074+
case 5u << 24: _gpu->mul1024(dst, src); _gpu->backward256(dst, 20480, 9); _gpu->backward320_0(dst); break;
1075+
1076+
// Unsupported transform size.
default: throw std::runtime_error("An unexpected error has occurred.");
1077+
}
1078+
1079+
// Runs for every supported size, after the size-specific routines above.
_gpu->carry_weight_mul(dst, a);
1080+
}
1081+
9601082
void mul_copy(const Reg rdst, const Reg rsrc, const Reg rdst_copy, const uint32 a = 1) const override
9611083
{
9621084
const size_t n = _n, dst = size_t(rdst), src = size_t(rsrc), dcopy = size_t(rdst_copy);

src/io/CliParser.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,8 @@ void printUsage(const char* progName) {
111111

112112
std::cout << " -maxe <value> : (Optional) Max bits for each E chunk (in MiB). If set to 0, defaults to 10000 bits. Example: -maxe 64 → 64 MiB = 536870912 bits. By default if no -maxe you it is set to 32 Mib." << std::endl;
113113
std::cout << " -memtest : GPU Memory & Stability test (OpenCL)" << std::endl;
114-
114+
std::cout << " -memlim <percent> : (Optional) Fraction percentage of memory used (used precompute stage 2 p-1)" << std::endl;
115+
115116
//std::cout << " -throttle_low : (Optional) Enable CL_QUEUE_THROTTLE_LOW_KHR if OpenCL >= 2.2 (default: disabled)" << std::endl;
116117
//std::cout << " -tune : (Optional) Automatically determine the best pacing (iterForce) and how often to call clFinish() to synchronize kernels (default: disabled)" << std::endl;
117118
std::cout << std::endl;
@@ -226,6 +227,7 @@ CliOptions CliParser::parse(int argc, char** argv ) {
226227
opts.mode = "memtest";
227228
opts.exponent = 127;
228229
}
230+
229231
else if (std::strcmp(argv[i], "-bench") == 0) {
230232
opts.bench = true;
231233
opts.exponent = 127;
@@ -307,6 +309,10 @@ CliOptions CliParser::parse(int argc, char** argv ) {
307309
opts.nmax = std::strtoull(argv[i + 1], nullptr, 10); // base 10
308310
++i;
309311
}
312+
else if (std::strcmp(argv[i], "-memlim") == 0 && i + 1 < argc) {
313+
opts.memlim = std::strtoull(argv[i + 1], nullptr, 10); // base 10
314+
++i;
315+
}
310316
else if (std::strcmp(argv[i], "-seed") == 0 && i + 1 < argc) {
311317
opts.curve_seed = std::strtoull(argv[i + 1], nullptr, 10); // base 10
312318
++i;

0 commit comments

Comments
 (0)