Skip to content

Commit 0349eed

Browse files
ECM: create faster xdbl_tail_uv in marin
1 parent 8a9436b commit 0349eed

File tree

3 files changed

+59
-3
lines changed

3 files changed

+59
-3
lines changed

include/marin/engine.h

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,27 @@ class engine
8686
set_multiplicand(src1, src1);
8787
mul(dst1, src1);
8888
}
89-
89+
90+
// Performs two multiplications back to back: mul(dst0, mul_src0, a0) followed
// by mul(dst1, mul_src1, a1). a0 and a1 are forwarded unchanged as mul()'s
// third argument and default to 1. This default implementation is just the two
// sequential mul() calls; the function is virtual so a backend can replace it.
// NOTE(review): the "prepared" in the name suggests mul_src0/mul_src1 are
// expected to already hold set_multiplicand() output - confirm against mul().
virtual void mul_pair_prepared(const Reg dst0, const Reg mul_src0,
91+
const Reg dst1, const Reg mul_src1,
92+
const uint32 a0 = 1, const uint32 a1 = 1) const
93+
{
94+
mul(dst0, mul_src0, a0);   // first product, written to dst0
95+
mul(dst1, mul_src1, a1);   // second product, written to dst1
96+
}
97+
98+
// Tail of the xDBL step once U and V are available (the caller in RunEcm.cpp
// annotates them as U=(X1+Z1)^2 in u_work and V=(X1-Z1)^2 in v_reg).
//   x_out     : multiplied in place by V
//   z_out     : receives a copy of the final value of u_work
//   u_work    : holds U on entry; overwritten (E, then A24*E+V, then the result)
//   v_reg     : holds V; only read here
//   a24_mul   : multiplicand register used for the A24 factor
//   tmp_e_mul : scratch register that receives the multiplicand form of E
//   tmp_v_mul : scratch register that receives the multiplicand form of V
// NOTE(review): x_out is assumed to already hold U on entry (the RunEcm call
// site passes X1 after square_mul_copy), giving x_out = U*V - confirm.
virtual void xdbl_tail_uv(const Reg x_out, const Reg z_out,
99+
const Reg u_work, const Reg v_reg,
100+
const Reg a24_mul,
101+
const Reg tmp_e_mul, const Reg tmp_v_mul) const
102+
{
103+
sub_reg(u_work, v_reg);                 // u_work = E = U - V
104+
set_multiplicand(tmp_v_mul, v_reg);     // tmp_v_mul = multiplicand form of V
105+
mul(x_out, tmp_v_mul);                  // x_out *= V
106+
set_multiplicand(tmp_e_mul, u_work);    // capture E before u_work is overwritten
107+
mul_add(u_work, a24_mul, v_reg);        // u_work = A24*E + V
108+
mul_copy(u_work, tmp_e_mul, z_out);     // u_work = (A24*E + V)*E, copied into z_out
109+
}
90110
virtual void addsub_copy(const Reg sum, const Reg diff, const Reg sum_copy, const Reg diff_copy,
91111
const Reg a, const Reg b) const
92112
{

include/marin/engine_gpu.h

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1272,7 +1272,37 @@ class engine_gpu : public engine
12721272
{
12731273
_gpu->addsub_copy((size_t)sum,(size_t)diff,(size_t)sum_copy,(size_t)diff_copy,(size_t)a,(size_t)b);
12741274
}
1275+
// GPU override of engine::mul_pair_prepared.
// When either factor differs from 1 it falls back to two ordinary mul() calls.
// Otherwise it runs mul_new_core() on each (dst, src) pair and then issues a
// single _gpu->carry_weight_mul2_unit(dst0, dst1) covering both destinations.
// NOTE(review): presumably carry_weight_mul2_unit finishes both products in
// one carry/weight pass, which would be the point of pairing them - confirm in
// the gpu implementation. Behaviour when rdst0 == rdst1 is not shown here.
void mul_pair_prepared(const Reg rdst0, const Reg rsrc0,
1276+
const Reg rdst1, const Reg rsrc1,
1277+
const uint32 a0 = 1, const uint32 a1 = 1) const override
1278+
{
1279+
// Non-unit factors are not handled by the paired path; use plain mul().
if ((a0 != 1) || (a1 != 1))
1280+
{
1281+
mul(rdst0, rsrc0, a0);
1282+
mul(rdst1, rsrc1, a1);
1283+
return;
1284+
}
1285+
1286+
// Convert the register handles to the size_t indices the GPU calls take.
const size_t dst0 = size_t(rdst0), src0 = size_t(rsrc0);
1287+
const size_t dst1 = size_t(rdst1), src1 = size_t(rsrc1);
1288+
1289+
mul_new_core(dst0, src0);
1290+
mul_new_core(dst1, src1);
1291+
_gpu->carry_weight_mul2_unit(dst0, dst1);   // one call for both destinations
1292+
}
12751293

1294+
// GPU override of engine::xdbl_tail_uv. Same inputs and outputs as the base
// version, but the two multiplications (x_out by V, u_work by A24) are issued
// together through mul_pair_prepared(), and the "+ V" is a separate add()
// instead of a fused mul_add().
// Both multiplicands are prepared up front and tmp_e_mul is read again by the
// final mul_copy(), so tmp_e_mul and tmp_v_mul must be different registers.
// NOTE(review): x_out is assumed to hold U on entry, as at the RunEcm call
// site - confirm.
void xdbl_tail_uv(const Reg x_out, const Reg z_out,
1295+
const Reg u_work, const Reg v_reg,
1296+
const Reg a24_mul,
1297+
const Reg tmp_e_mul, const Reg tmp_v_mul) const override
1298+
{
1299+
sub_reg(u_work, v_reg);                                  // u_work = E = U - V
1300+
set_multiplicand(tmp_e_mul, u_work);                     // capture E before u_work is overwritten
1301+
set_multiplicand(tmp_v_mul, v_reg);                      // multiplicand form of V
1302+
mul_pair_prepared(x_out, tmp_v_mul, u_work, a24_mul);    // x_out *= V and u_work = A24*E
1303+
add(u_work, v_reg);                                      // u_work = A24*E + V
1304+
mul_copy(u_work, tmp_e_mul, z_out);                      // u_work = (A24*E + V)*E, copied into z_out
1305+
}
12761306
size_t get_register_data_size() const override { return _reg_count * _n * sizeof(uint64); }
12771307

12781308
bool get_data(std::vector<char> & data, const Reg src) const override

src/modes/RunEcm.cpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -792,7 +792,7 @@ uint32_t s2_idx = 0, s2_cnt = 0; double s2_et = 0.0;
792792
eng->mul((engine::Reg)Z2,(engine::Reg)13); // Z2=k13*...
793793

794794
// xDBL part (U,V,E with fused square+copy; no extra copy for Z1)
795-
eng->square_mul_copy((engine::Reg)25,(engine::Reg)X1); // 25=U=(X1+Z1)^2, and X1=U
795+
/*eng->square_mul_copy((engine::Reg)25,(engine::Reg)X1); // 25=U=(X1+Z1)^2, and X1=U
796796
eng->square_mul((engine::Reg)24); // 24=V=(X1−Z1)^2
797797
eng->sub_reg((engine::Reg)25,(engine::Reg)24); // 25=E=U−V
798798
eng->set_multiplicand((engine::Reg)15,(engine::Reg)24); // *V
@@ -801,7 +801,13 @@ uint32_t s2_idx = 0, s2_cnt = 0; double s2_et = 0.0;
801801
//eng->mul((engine::Reg)25,(engine::Reg)12); // 25=A24*E
802802
//eng->add((engine::Reg)25,(engine::Reg)24); // 25=A24*E + V
803803
eng->mul_add((engine::Reg)25,(engine::Reg)12,(engine::Reg)24);
804-
eng->mul_copy((engine::Reg)25,(engine::Reg)15,(engine::Reg)Z1); // Z1=(A24*E+V)*E
804+
eng->mul_copy((engine::Reg)25,(engine::Reg)15,(engine::Reg)Z1); // Z1=(A24*E+V)*E*/
805+
eng->square_mul_copy((engine::Reg)25,(engine::Reg)X1);
806+
eng->square_mul((engine::Reg)24);
807+
eng->xdbl_tail_uv((engine::Reg)X1, (engine::Reg)Z1,
808+
(engine::Reg)25, (engine::Reg)24,
809+
(engine::Reg)12,
810+
(engine::Reg)15, (engine::Reg)11);
805811
};
806812

807813
auto xDBLADD_strict2 = [&](size_t X1,size_t Z1, size_t X2,size_t Z2){

0 commit comments

Comments
 (0)