@@ -21,7 +21,7 @@ template <typename Torus>
2121__global__ void radix_blocks_rotate_right (Torus *dst, Torus *src,
2222 uint32_t value, uint32_t blocks_count,
2323 uint32_t lwe_size);
24- void generate_ids_update_degrees (int *terms_degree, size_t *h_lwe_idx_in,
24+ void generate_ids_update_degrees (uint64_t *terms_degree, size_t *h_lwe_idx_in,
2525 size_t *h_lwe_idx_out,
2626 int32_t *h_smart_copy_in,
2727 int32_t *h_smart_copy_out, size_t ch_amount,
@@ -1161,10 +1161,10 @@ template <typename Torus> struct int_overflowing_sub_memory {
11611161};
11621162
11631163template <typename Torus> struct int_sum_ciphertexts_vec_memory {
1164- Torus *new_blocks;
1165- Torus *new_blocks_copy;
1166- Torus *old_blocks;
1167- Torus *small_lwe_vector;
1164+ CudaRadixCiphertextFFI *new_blocks;
1165+ CudaRadixCiphertextFFI *new_blocks_copy;
1166+ CudaRadixCiphertextFFI *old_blocks;
1167+ CudaRadixCiphertextFFI *small_lwe_vector;
11681168 int_radix_params params;
11691169
11701170 int32_t *d_smart_copy_in;
@@ -1183,34 +1183,22 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
11831183 int max_pbs_count = num_blocks_in_radix * max_num_radix_in_vec;
11841184
11851185 // allocate gpu memory for intermediate buffers
1186- new_blocks = (Torus *)cuda_malloc_async (
1187- max_pbs_count * (params.big_lwe_dimension + 1 ) * sizeof (Torus),
1188- streams[0 ], gpu_indexes[0 ]);
1189- new_blocks_copy = (Torus *)cuda_malloc_async (
1190- max_pbs_count * (params.big_lwe_dimension + 1 ) * sizeof (Torus),
1191- streams[0 ], gpu_indexes[0 ]);
1192- old_blocks = (Torus *)cuda_malloc_async (
1193- max_pbs_count * (params.big_lwe_dimension + 1 ) * sizeof (Torus),
1194- streams[0 ], gpu_indexes[0 ]);
1195- small_lwe_vector = (Torus *)cuda_malloc_async (
1196- max_pbs_count * (params.small_lwe_dimension + 1 ) * sizeof (Torus),
1197- streams[0 ], gpu_indexes[0 ]);
1198- cuda_memset_async (new_blocks, 0 ,
1199- max_pbs_count * (params.big_lwe_dimension + 1 ) *
1200- sizeof (Torus),
1201- streams[0 ], gpu_indexes[0 ]);
1202- cuda_memset_async (new_blocks_copy, 0 ,
1203- max_pbs_count * (params.big_lwe_dimension + 1 ) *
1204- sizeof (Torus),
1205- streams[0 ], gpu_indexes[0 ]);
1206- cuda_memset_async (old_blocks, 0 ,
1207- max_pbs_count * (params.big_lwe_dimension + 1 ) *
1208- sizeof (Torus),
1209- streams[0 ], gpu_indexes[0 ]);
1210- cuda_memset_async (small_lwe_vector, 0 ,
1211- max_pbs_count * (params.small_lwe_dimension + 1 ) *
1212- sizeof (Torus),
1213- streams[0 ], gpu_indexes[0 ]);
1186+ new_blocks = new CudaRadixCiphertextFFI;
1187+ create_zero_radix_ciphertext_async<Torus>(streams[0 ], gpu_indexes[0 ],
1188+ new_blocks, max_pbs_count,
1189+ params.big_lwe_dimension );
1190+ new_blocks_copy = new CudaRadixCiphertextFFI;
1191+ create_zero_radix_ciphertext_async<Torus>(streams[0 ], gpu_indexes[0 ],
1192+ new_blocks_copy, max_pbs_count,
1193+ params.big_lwe_dimension );
1194+ old_blocks = new CudaRadixCiphertextFFI;
1195+ create_zero_radix_ciphertext_async<Torus>(streams[0 ], gpu_indexes[0 ],
1196+ old_blocks, max_pbs_count,
1197+ params.big_lwe_dimension );
1198+ small_lwe_vector = new CudaRadixCiphertextFFI;
1199+ create_zero_radix_ciphertext_async<Torus>(streams[0 ], gpu_indexes[0 ],
1200+ small_lwe_vector, max_pbs_count,
1201+ params.small_lwe_dimension );
12141202
12151203 d_smart_copy_in = (int32_t *)cuda_malloc_async (
12161204 max_pbs_count * sizeof (int32_t ), streams[0 ], gpu_indexes[0 ]);
@@ -1227,8 +1215,9 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
12271215 uint32_t gpu_count, int_radix_params params,
12281216 uint32_t num_blocks_in_radix,
12291217 uint32_t max_num_radix_in_vec,
1230- Torus *new_blocks, Torus *old_blocks,
1231- Torus *small_lwe_vector) {
1218+ CudaRadixCiphertextFFI *new_blocks,
1219+ CudaRadixCiphertextFFI *old_blocks,
1220+ CudaRadixCiphertextFFI *small_lwe_vector) {
12321221 mem_reuse = true ;
12331222 this ->params = params;
12341223
@@ -1238,13 +1227,10 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
12381227 this ->new_blocks = new_blocks;
12391228 this ->old_blocks = old_blocks;
12401229 this ->small_lwe_vector = small_lwe_vector;
1241- new_blocks_copy = (Torus *)cuda_malloc_async (
1242- max_pbs_count * (params.big_lwe_dimension + 1 ) * sizeof (Torus),
1243- streams[0 ], gpu_indexes[0 ]);
1244- cuda_memset_async (new_blocks_copy, 0 ,
1245- max_pbs_count * (params.big_lwe_dimension + 1 ) *
1246- sizeof (Torus),
1247- streams[0 ], gpu_indexes[0 ]);
1230+ new_blocks_copy = new CudaRadixCiphertextFFI;
1231+ create_zero_radix_ciphertext_async<Torus>(streams[0 ], gpu_indexes[0 ],
1232+ new_blocks_copy, max_pbs_count,
1233+ params.big_lwe_dimension );
12481234
12491235 d_smart_copy_in = (int32_t *)cuda_malloc_async (
12501236 max_pbs_count * sizeof (int32_t ), streams[0 ], gpu_indexes[0 ]);
@@ -1262,12 +1248,15 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
12621248 cuda_drop_async (d_smart_copy_out, streams[0 ], gpu_indexes[0 ]);
12631249
12641250 if (!mem_reuse) {
1265- cuda_drop_async (new_blocks, streams[0 ], gpu_indexes[0 ]);
1266- cuda_drop_async (old_blocks, streams[0 ], gpu_indexes[0 ]);
1267- cuda_drop_async (small_lwe_vector, streams[0 ], gpu_indexes[0 ]);
1251+ release_radix_ciphertext (streams[0 ], gpu_indexes[0 ], new_blocks);
1252+ delete new_blocks;
1253+ release_radix_ciphertext (streams[0 ], gpu_indexes[0 ], old_blocks);
1254+ delete old_blocks;
1255+ release_radix_ciphertext (streams[0 ], gpu_indexes[0 ], small_lwe_vector);
1256+ delete small_lwe_vector;
12681257 }
1269-
1270- cuda_drop_async (new_blocks_copy, streams[ 0 ], gpu_indexes[ 0 ]) ;
1258+ release_radix_ciphertext (streams[ 0 ], gpu_indexes[ 0 ], new_blocks_copy);
1259+ delete new_blocks_copy ;
12711260 }
12721261};
12731262// For sequential algorithm in group propagation
@@ -2482,7 +2471,7 @@ template <typename Torus> struct int_zero_out_if_buffer {
24822471
24832472 int_radix_params params;
24842473
2485- Torus *tmp;
2474+ CudaRadixCiphertextFFI *tmp;
24862475
24872476 cudaStream_t *true_streams;
24882477 cudaStream_t *false_streams;
@@ -2495,10 +2484,11 @@ template <typename Torus> struct int_zero_out_if_buffer {
24952484 this ->params = params;
24962485 active_gpu_count = get_active_gpu_count (num_radix_blocks, gpu_count);
24972486
2498- Torus big_size =
2499- (params.big_lwe_dimension + 1 ) * num_radix_blocks * sizeof (Torus);
25002487 if (allocate_gpu_memory) {
2501- tmp = (Torus *)cuda_malloc_async (big_size, streams[0 ], gpu_indexes[0 ]);
2488+ tmp = new CudaRadixCiphertextFFI;
2489+ create_zero_radix_ciphertext_async<Torus>(streams[0 ], gpu_indexes[0 ], tmp,
2490+ num_radix_blocks,
2491+ params.big_lwe_dimension );
25022492 // We may use a different stream to allow concurrent operation
25032493 true_streams =
25042494 (cudaStream_t *)malloc (active_gpu_count * sizeof (cudaStream_t));
@@ -2512,7 +2502,8 @@ template <typename Torus> struct int_zero_out_if_buffer {
25122502 }
25132503 void release (cudaStream_t const *streams, uint32_t const *gpu_indexes,
25142504 uint32_t gpu_count) {
2515- cuda_drop_async (tmp, streams[0 ], gpu_indexes[0 ]);
2505+ release_radix_ciphertext (streams[0 ], gpu_indexes[0 ], tmp);
2506+ delete tmp;
25162507 for (uint j = 0 ; j < active_gpu_count; j++) {
25172508 cuda_destroy_stream (true_streams[j], gpu_indexes[j]);
25182509 cuda_destroy_stream (false_streams[j], gpu_indexes[j]);
@@ -2523,9 +2514,9 @@ template <typename Torus> struct int_zero_out_if_buffer {
25232514};
25242515
25252516template <typename Torus> struct int_mul_memory {
2526- Torus *vector_result_sb;
2527- Torus *block_mul_res;
2528- Torus *small_lwe_vector;
2517+ CudaRadixCiphertextFFI *vector_result_sb;
2518+ CudaRadixCiphertextFFI *block_mul_res;
2519+ CudaRadixCiphertextFFI *small_lwe_vector;
25292520
25302521 int_radix_lut<Torus> *luts_array; // lsb msb
25312522 int_radix_lut<Torus> *zero_out_predicate_lut;
@@ -2574,7 +2565,6 @@ template <typename Torus> struct int_mul_memory {
25742565 auto polynomial_size = params.polynomial_size ;
25752566 auto message_modulus = params.message_modulus ;
25762567 auto carry_modulus = params.carry_modulus ;
2577- auto lwe_dimension = params.small_lwe_dimension ;
25782568
25792569 // 'vector_result_lsb' contains blocks from all possible shifts of
25802570 // radix_lwe_left excluding zero ciphertext blocks
@@ -2587,17 +2577,18 @@ template <typename Torus> struct int_mul_memory {
25872577 int total_block_count = lsb_vector_block_count + msb_vector_block_count;
25882578
25892579 // allocate memory for intermediate buffers
2590- vector_result_sb = (Torus *)cuda_malloc_async (
2591- 2 * total_block_count * (polynomial_size * glwe_dimension + 1 ) *
2592- sizeof (Torus),
2593- streams[0 ], gpu_indexes[0 ]);
2594- block_mul_res = (Torus *)cuda_malloc_async (
2595- 2 * total_block_count * (polynomial_size * glwe_dimension + 1 ) *
2596- sizeof (Torus),
2597- streams[0 ], gpu_indexes[0 ]);
2598- small_lwe_vector = (Torus *)cuda_malloc_async (
2599- total_block_count * (lwe_dimension + 1 ) * sizeof (Torus), streams[0 ],
2600- gpu_indexes[0 ]);
2580+ vector_result_sb = new CudaRadixCiphertextFFI;
2581+ create_zero_radix_ciphertext_async<Torus>(
2582+ streams[0 ], gpu_indexes[0 ], vector_result_sb, 2 * total_block_count,
2583+ params.big_lwe_dimension );
2584+ block_mul_res = new CudaRadixCiphertextFFI;
2585+ create_zero_radix_ciphertext_async<Torus>(
2586+ streams[0 ], gpu_indexes[0 ], block_mul_res, 2 * total_block_count,
2587+ params.big_lwe_dimension );
2588+ small_lwe_vector = new CudaRadixCiphertextFFI;
2589+ create_zero_radix_ciphertext_async<Torus>(
2590+ streams[0 ], gpu_indexes[0 ], small_lwe_vector, total_block_count,
2591+ params.small_lwe_dimension );
26012592
26022593 // create int_radix_lut objects for lsb, msb, message, carry
26032594 // luts_array -> lut = {lsb_acc, msb_acc}
@@ -2658,9 +2649,12 @@ template <typename Torus> struct int_mul_memory {
26582649
26592650 return ;
26602651 }
2661- cuda_drop_async (vector_result_sb, streams[0 ], gpu_indexes[0 ]);
2662- cuda_drop_async (block_mul_res, streams[0 ], gpu_indexes[0 ]);
2663- cuda_drop_async (small_lwe_vector, streams[0 ], gpu_indexes[0 ]);
2652+ release_radix_ciphertext (streams[0 ], gpu_indexes[0 ], vector_result_sb);
2653+ delete vector_result_sb;
2654+ release_radix_ciphertext (streams[0 ], gpu_indexes[0 ], block_mul_res);
2655+ delete block_mul_res;
2656+ release_radix_ciphertext (streams[0 ], gpu_indexes[0 ], small_lwe_vector);
2657+ delete small_lwe_vector;
26642658
26652659 luts_array->release (streams, gpu_indexes, gpu_count);
26662660 sum_ciphertexts_mem->release (streams, gpu_indexes, gpu_count);
@@ -4435,7 +4429,7 @@ template <typename Torus> struct int_scalar_mul_buffer {
44354429 int_radix_params params;
44364430 int_logical_scalar_shift_buffer<Torus> *logical_scalar_shift_buffer;
44374431 int_sum_ciphertexts_vec_memory<Torus> *sum_ciphertexts_vec_mem;
4438- Torus *preshifted_buffer;
4432+ CudaRadixCiphertextFFI *preshifted_buffer;
44394433 CudaRadixCiphertextFFI *all_shifted_buffer;
44404434 int_sc_prop_memory<Torus> *sc_prop_mem;
44414435 bool anticipated_buffers_drop;
@@ -4450,25 +4444,21 @@ template <typename Torus> struct int_scalar_mul_buffer {
44504444
44514445 if (allocate_gpu_memory) {
44524446 uint32_t msg_bits = (uint32_t )std::log2 (params.message_modulus );
4453- uint32_t lwe_size = params.big_lwe_dimension + 1 ;
4454- uint32_t lwe_size_bytes = lwe_size * sizeof (Torus);
44554447 size_t num_ciphertext_bits = msg_bits * num_radix_blocks;
44564448
44574449 // Contains all shifted values of lhs for shifts in range (0..msg_bits).
44584450 // The idea is that with these we can create all other shifts that are
44594451 // in range (0..total_bits) for free (block rotation).
4460- preshifted_buffer = (Torus *)cuda_malloc_async (
4461- num_ciphertext_bits * lwe_size_bytes, streams[0 ], gpu_indexes[0 ]);
4452+ preshifted_buffer = new CudaRadixCiphertextFFI;
4453+ create_zero_radix_ciphertext_async<Torus>(
4454+ streams[0 ], gpu_indexes[0 ], preshifted_buffer, num_ciphertext_bits,
4455+ params.big_lwe_dimension );
44624456
44634457 all_shifted_buffer = new CudaRadixCiphertextFFI;
44644458 create_zero_radix_ciphertext_async<Torus>(
44654459 streams[0 ], gpu_indexes[0 ], all_shifted_buffer,
44664460 num_ciphertext_bits * num_radix_blocks, params.big_lwe_dimension );
44674461
4468- cuda_memset_async (preshifted_buffer, 0 ,
4469- num_ciphertext_bits * lwe_size_bytes, streams[0 ],
4470- gpu_indexes[0 ]);
4471-
44724462 if (num_ciphertext_bits * num_radix_blocks >= num_radix_blocks + 2 )
44734463 logical_scalar_shift_buffer =
44744464 new int_logical_scalar_shift_buffer<Torus>(
@@ -4500,7 +4490,8 @@ template <typename Torus> struct int_scalar_mul_buffer {
45004490 release_radix_ciphertext (streams[0 ], gpu_indexes[0 ], all_shifted_buffer);
45014491 delete all_shifted_buffer;
45024492 if (!anticipated_buffers_drop) {
4503- cuda_drop_async (preshifted_buffer, streams[0 ], gpu_indexes[0 ]);
4493+ release_radix_ciphertext (streams[0 ], gpu_indexes[0 ], preshifted_buffer);
4494+ delete preshifted_buffer;
45044495 logical_scalar_shift_buffer->release (streams, gpu_indexes, gpu_count);
45054496 delete (logical_scalar_shift_buffer);
45064497 }
0 commit comments