@@ -896,6 +896,67 @@ class engine_gpu : public engine
896896 }
897897 }
898898
899+ void set_multiplicand2 (const Reg rdst, const Reg rsrc) const override
900+ {
901+ copy (rdst, rsrc);
902+
903+ const size_t n = _n, dst = size_t (rdst);
904+
905+ switch (n)
906+ {
907+ // case 1u << 2: _gpu->mul4x1(dst, src); break;
908+ // case 1u << 3: _gpu->mul8(dst, src); break;
909+ case 1u << 4 : _gpu->forward4_0 (dst); break ;
910+ case 1u << 5 : _gpu->forward4_0 (dst); break ;
911+ case 1u << 6 : _gpu->forward16_0 (dst); break ;
912+ case 1u << 7 : _gpu->forward16_0 (dst); break ;
913+ case 1u << 8 : _gpu->forward16_0 (dst); break ;
914+ case 1u << 9 : _gpu->forward16_0 (dst); break ;
915+ case 1u << 10 : _gpu->forward16_0 (dst); break ;
916+ case 1u << 11 : _gpu->forward16_0 (dst); break ;
917+ case 1u << 12 : _gpu->forward64_0 (dst); break ;
918+ case 1u << 13 : _gpu->forward64_0 (dst); break ;
919+ case 1u << 14 : _gpu->forward64_0 (dst); break ;
920+ case 1u << 15 : _gpu->forward64_0 (dst); break ;
921+ case 1u << 16 : _gpu->forward64_0 (dst); break ;
922+ case 1u << 17 : _gpu->forward256_0 (dst); break ;
923+ case 1u << 18 : _gpu->forward256_0 (dst); break ;
924+ case 1u << 19 : _gpu->forward1024_0 (dst); break ;
925+ case 1u << 20 : _gpu->forward1024_0 (dst); break ;
926+ case 1u << 21 : _gpu->forward64_0 (dst); _gpu->forward64 (dst, 1024 , 8 ); break ;
927+ case 1u << 22 : _gpu->forward64_0 (dst); _gpu->forward64 (dst, 1024 , 9 ); break ;
928+ case 1u << 23 : _gpu->forward64_0 (dst); _gpu->forward256 (dst, 4096 , 8 ); break ;
929+ case 1u << 24 : _gpu->forward64_0 (dst); _gpu->forward256 (dst, 4096 , 9 ); break ;
930+ case 1u << 25 : _gpu->forward256_0 (dst); _gpu->forward256 (dst, 16384 , 8 ); break ;
931+ case 1u << 26 : _gpu->forward256_0 (dst); _gpu->forward256 (dst, 16384 , 9 ); break ;
932+
933+ case 5u << 3 : _gpu->forward5_0 (dst); break ;
934+ case 5u << 4 : _gpu->forward20_0 (dst); break ;
935+ case 5u << 5 : _gpu->forward20_0 (dst); break ;
936+ case 5u << 6 : _gpu->forward20_0 (dst); break ;
937+ case 5u << 7 : _gpu->forward20_0 (dst); break ;
938+ case 5u << 8 : _gpu->forward20_0 (dst); break ;
939+ case 5u << 9 : _gpu->forward20_0 (dst); break ;
940+ case 5u << 10 : _gpu->forward80_0 (dst); break ;
941+ case 5u << 11 : _gpu->forward80_0 (dst); break ;
942+ case 5u << 12 : _gpu->forward80_0 (dst); break ;
943+ case 5u << 13 : _gpu->forward80_0 (dst); break ;
944+ case 5u << 14 : _gpu->forward320_0 (dst); break ;
945+ case 5u << 15 : _gpu->forward320_0 (dst); break ;
946+ case 5u << 16 : _gpu->forward320_0 (dst); break ;
947+ case 5u << 17 : _gpu->forward80_0 (dst); _gpu->forward64 (dst, 1280 , 6 ); break ;
948+ case 5u << 18 : _gpu->forward80_0 (dst); _gpu->forward64 (dst, 1280 , 7 ); break ;
949+ case 5u << 19 : _gpu->forward80_0 (dst); _gpu->forward256 (dst, 5120 , 6 ); break ;
950+ case 5u << 20 : _gpu->forward80_0 (dst); _gpu->forward256 (dst, 5120 , 7 ); break ;
951+ case 5u << 21 : _gpu->forward80_0 (dst); _gpu->forward256 (dst, 5120 , 8 ); break ;
952+ case 5u << 22 : _gpu->forward80_0 (dst); _gpu->forward256 (dst, 5120 , 9 ); break ;
953+ case 5u << 23 : _gpu->forward320_0 (dst); _gpu->forward256 (dst, 20480 , 8 ); break ;
954+ case 5u << 24 : _gpu->forward320_0 (dst); _gpu->forward256 (dst, 20480 , 9 ); break ;
955+
956+ default : throw std::runtime_error (" An unexpected error has occurred." );
957+ }
958+ }
959+
899960 void mul (const Reg rdst, const Reg rsrc, const uint32 a = 1 ) const override
900961 {
901962 const size_t n = _n, dst = size_t (rdst), src = size_t (rsrc);
@@ -957,6 +1018,67 @@ class engine_gpu : public engine
9571018 _gpu->carry_weight_mul (dst, a);
9581019 }
9591020
1021+ void mul_new (const Reg rdst, const Reg rsrc, const uint32 a = 1 ) const override
1022+ {
1023+ const size_t n = _n, dst = size_t (rdst), src = size_t (rsrc);
1024+
1025+ switch (n)
1026+ {
1027+ case 1u << 2 : _gpu->mul4x1 (dst, src); break ;
1028+ case 1u << 3 : _gpu->mul8 (dst, src); break ;
1029+ case 1u << 4 : _gpu->mul4 (dst, src); _gpu->backward4_0 (dst); break ;
1030+ case 1u << 5 : _gpu->mul8 (dst, src); _gpu->backward4_0 (dst); break ;
1031+ case 1u << 6 : _gpu->mul4 (dst, src); _gpu->backward16_0 (dst); break ;
1032+ case 1u << 7 : _gpu->mul8 (dst, src); _gpu->backward16_0 (dst); break ;
1033+ case 1u << 8 : _gpu->mul16 (dst, src); _gpu->backward16_0 (dst); break ;
1034+ case 1u << 9 : _gpu->mul32 (dst, src); _gpu->backward16_0 (dst); break ;
1035+ case 1u << 10 : _gpu->mul64 (dst, src); _gpu->backward16_0 (dst); break ;
1036+ case 1u << 11 : _gpu->mul128 (dst, src); _gpu->backward16_0 (dst); break ;
1037+ case 1u << 12 : _gpu->mul64 (dst, src); _gpu->backward64_0 (dst); break ;
1038+ case 1u << 13 : _gpu->mul128 (dst, src); _gpu->backward64_0 (dst); break ;
1039+ case 1u << 14 : _gpu->mul256 (dst, src); _gpu->backward64_0 (dst); break ;
1040+ case 1u << 15 : _gpu->mul512 (dst, src); _gpu->backward64_0 (dst); break ;
1041+ case 1u << 16 : _gpu->mul1024 (dst, src); _gpu->backward64_0 (dst); break ;
1042+ case 1u << 17 : _gpu->mul512 (dst, src); _gpu->backward256_0 (dst); break ;
1043+ case 1u << 18 : _gpu->mul1024 (dst, src); _gpu->backward256_0 (dst); break ;
1044+ case 1u << 19 : _gpu->mul512 (dst, src); _gpu->backward1024_0 (dst); break ;
1045+ case 1u << 20 : _gpu->mul1024 (dst, src); _gpu->backward1024_0 (dst); break ;
1046+ case 1u << 21 : _gpu->mul512 (dst, src); _gpu->backward64 (dst, 1024 , 8 ); _gpu->backward64_0 (dst); break ;
1047+ case 1u << 22 : _gpu->mul1024 (dst, src); _gpu->backward64 (dst, 1024 , 9 ); _gpu->backward64_0 (dst); break ;
1048+ case 1u << 23 : _gpu->mul512 (dst, src); _gpu->backward256 (dst, 4096 , 8 ); _gpu->backward64_0 (dst); break ;
1049+ case 1u << 24 : _gpu->mul1024 (dst, src); _gpu->backward256 (dst, 4096 , 9 ); _gpu->backward64_0 (dst); break ;
1050+ case 1u << 25 : _gpu->mul512 (dst, src); _gpu->backward256 (dst, 16384 , 8 ); _gpu->backward256_0 (dst); break ;
1051+ case 1u << 26 : _gpu->mul1024 (dst, src); _gpu->backward256 (dst, 16384 , 9 ); _gpu->backward256_0 (dst); break ;
1052+
1053+ case 5u << 3 : _gpu->mul8 (dst, src); _gpu->backward5_0 (dst); break ;
1054+ case 5u << 4 : _gpu->mul4 (dst, src); _gpu->backward20_0 (dst); break ;
1055+ case 5u << 5 : _gpu->mul8 (dst, src); _gpu->backward20_0 (dst); break ;
1056+ case 5u << 6 : _gpu->mul16 (dst, src); _gpu->backward20_0 (dst); break ;
1057+ case 5u << 7 : _gpu->mul32 (dst, src); _gpu->backward20_0 (dst); break ;
1058+ case 5u << 8 : _gpu->mul64 (dst, src); _gpu->backward20_0 (dst); break ;
1059+ case 5u << 9 : _gpu->mul128 (dst, src); _gpu->backward20_0 (dst); break ;
1060+ case 5u << 10 : _gpu->mul64 (dst, src); _gpu->backward80_0 (dst); break ;
1061+ case 5u << 11 : _gpu->mul128 (dst, src); _gpu->backward80_0 (dst); break ;
1062+ case 5u << 12 : _gpu->mul256 (dst, src); _gpu->backward80_0 (dst); break ;
1063+ case 5u << 13 : _gpu->mul512 (dst, src); _gpu->backward80_0 (dst); break ;
1064+ case 5u << 14 : _gpu->mul256 (dst, src); _gpu->backward320_0 (dst); break ;
1065+ case 5u << 15 : _gpu->mul512 (dst, src); _gpu->backward320_0 (dst); break ;
1066+ case 5u << 16 : _gpu->mul1024 (dst, src); _gpu->backward320_0 (dst); break ;
1067+ case 5u << 17 : _gpu->mul128 (dst, src); _gpu->backward64 (dst, 1280 , 6 ); _gpu->backward80_0 (dst); break ;
1068+ case 5u << 18 : _gpu->mul256 (dst, src); _gpu->backward64 (dst, 1280 , 7 ); _gpu->backward80_0 (dst); break ;
1069+ case 5u << 19 : _gpu->mul128 (dst, src); _gpu->backward256 (dst, 5120 , 6 ); _gpu->backward80_0 (dst); break ;
1070+ case 5u << 20 : _gpu->mul256 (dst, src); _gpu->backward256 (dst, 5120 , 7 ); _gpu->backward80_0 (dst); break ;
1071+ case 5u << 21 : _gpu->mul512 (dst, src); _gpu->backward256 (dst, 5120 , 8 ); _gpu->backward80_0 (dst); break ;
1072+ case 5u << 22 : _gpu->mul1024 (dst, src); _gpu->backward256 (dst, 5120 , 9 ); _gpu->backward80_0 (dst); break ;
1073+ case 5u << 23 : _gpu->mul512 (dst, src); _gpu->backward256 (dst, 20480 , 8 ); _gpu->backward320_0 (dst); break ;
1074+ case 5u << 24 : _gpu->mul1024 (dst, src); _gpu->backward256 (dst, 20480 , 9 ); _gpu->backward320_0 (dst); break ;
1075+
1076+ default : throw std::runtime_error (" An unexpected error has occurred." );
1077+ }
1078+
1079+ _gpu->carry_weight_mul (dst, a);
1080+ }
1081+
9601082 void mul_copy (const Reg rdst, const Reg rsrc, const Reg rdst_copy, const uint32 a = 1 ) const override
9611083 {
9621084 const size_t n = _n, dst = size_t (rdst), src = size_t (rsrc), dcopy = size_t (rdst_copy);
0 commit comments