@@ -689,6 +689,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
689689 cuda_memcpy_async_to_cpu (&lut_indexes, lut->get_lut_indexes (0 , 0 ),
690690 lut->num_blocks * sizeof (Torus), streams[0 ],
691691 gpu_indexes[0 ]);
692+ cuda_synchronize_stream (streams[0 ], gpu_indexes[0 ]);
692693 for (uint i = 0 ; i < num_radix_blocks; i++) {
693694 lwe_array_out->degrees [i] = lut->degrees [lut_indexes[i]];
694695 lwe_array_out->noise_levels [i] = NoiseLevel::NOMINAL;
@@ -964,6 +965,7 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
964965 cuda_memcpy_async_to_cpu (&lut_indexes, lut->get_lut_indexes (0 , 0 ),
965966 lut->num_blocks * sizeof (Torus), streams[0 ],
966967 gpu_indexes[0 ]);
968+ cuda_synchronize_stream (streams[0 ], gpu_indexes[0 ]);
967969 for (uint i = 0 ; i < lwe_array_out->num_radix_blocks ; i++) {
968970 lwe_array_out->degrees [i] = lut->degrees [i % lut->num_blocks ];
969971 lwe_array_out->noise_levels [i] = NoiseLevel::NOMINAL;
@@ -1173,6 +1175,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
11731175 cuda_memcpy_async_to_cpu (&lut_indexes, lut->get_lut_indexes (0 , 0 ),
11741176 lut->num_blocks * sizeof (Torus), streams[0 ],
11751177 gpu_indexes[0 ]);
1178+ cuda_synchronize_stream (streams[0 ], gpu_indexes[0 ]);
11761179 for (uint i = 0 ; i < num_radix_blocks; i++) {
11771180 lwe_array_out->degrees [i] = lut->degrees [lut_indexes[i]];
11781181 lwe_array_out->noise_levels [i] = NoiseLevel::NOMINAL;
@@ -1974,7 +1977,8 @@ void host_compute_shifted_blocks_and_borrow_states(
19741977template <typename Torus>
19751978void host_full_propagate_inplace (cudaStream_t const *streams,
19761979 uint32_t const *gpu_indexes,
1977- uint32_t gpu_count, Torus *input_blocks,
1980+ uint32_t gpu_count,
1981+ CudaRadixCiphertextFFI *input_blocks,
19781982 int_fullprop_buffer<Torus> *mem_ptr,
19791983 Torus *const *ksks, void *const *bsks,
19801984 uint32_t num_blocks) {
@@ -1987,39 +1991,51 @@ void host_full_propagate_inplace(cudaStream_t const *streams,
19871991 uint32_t num_many_lut = 1 ;
19881992 uint32_t lut_stride = 0 ;
19891993 for (int i = 0 ; i < num_blocks; i++) {
1990- auto cur_input_block = &input_blocks[i * big_lwe_size];
1994+ CudaRadixCiphertextFFI cur_input_block;
1995+ as_radix_ciphertext_slice<Torus>(&cur_input_block, input_blocks, i, i + 1 );
19911996
19921997 /// Since the keyswitch is done on one input only, use only 1 GPU
19931998 execute_keyswitch_async<Torus>(
1994- streams, gpu_indexes, 1 , mem_ptr->tmp_small_lwe_vector ,
1995- mem_ptr->lut ->lwe_trivial_indexes , cur_input_block,
1999+ streams, gpu_indexes, 1 , (Torus *)( mem_ptr->tmp_small_lwe_vector -> ptr ) ,
2000+ mem_ptr->lut ->lwe_trivial_indexes , (Torus *) cur_input_block. ptr ,
19962001 mem_ptr->lut ->lwe_trivial_indexes , ksks, params.big_lwe_dimension ,
19972002 params.small_lwe_dimension , params.ks_base_log , params.ks_level , 1 );
19982003
1999- cuda_memcpy_async_gpu_to_gpu (&mem_ptr->tmp_small_lwe_vector [small_lwe_size],
2000- mem_ptr->tmp_small_lwe_vector ,
2001- small_lwe_size * sizeof (Torus), streams[0 ],
2002- gpu_indexes[0 ]);
2004+ copy_radix_ciphertext_slice_async<Torus>(
2005+ streams[0 ], gpu_indexes[0 ], mem_ptr->tmp_small_lwe_vector , 1 , 2 ,
2006+ mem_ptr->tmp_small_lwe_vector , 0 , 1 );
20032007
20042008 execute_pbs_async<Torus>(
2005- streams, gpu_indexes, 1 , mem_ptr->tmp_big_lwe_vector ,
2009+ streams, gpu_indexes, 1 , (Torus *) mem_ptr->tmp_big_lwe_vector -> ptr ,
20062010 mem_ptr->lut ->lwe_trivial_indexes , mem_ptr->lut ->lut_vec ,
2007- mem_ptr->lut ->lut_indexes_vec , mem_ptr->tmp_small_lwe_vector ,
2011+ mem_ptr->lut ->lut_indexes_vec ,
2012+ (Torus *)mem_ptr->tmp_small_lwe_vector ->ptr ,
20082013 mem_ptr->lut ->lwe_trivial_indexes , bsks, mem_ptr->lut ->buffer ,
20092014 params.glwe_dimension , params.small_lwe_dimension ,
20102015 params.polynomial_size , params.pbs_base_log , params.pbs_level ,
20112016 params.grouping_factor , 2 , params.pbs_type , num_many_lut, lut_stride);
20122017
2013- cuda_memcpy_async_gpu_to_gpu (
2014- (void *)cur_input_block, mem_ptr->tmp_big_lwe_vector ,
2015- big_lwe_size * sizeof (Torus), streams[0 ], gpu_indexes[0 ]);
2018+ copy_radix_ciphertext_slice_async<Torus>(streams[0 ], gpu_indexes[0 ],
2019+ &cur_input_block, 0 , 1 ,
2020+ mem_ptr->tmp_big_lwe_vector , 0 , 1 );
2021+ Torus lut_indexes[mem_ptr->lut ->num_blocks ];
2022+ cuda_memcpy_async_to_cpu (&lut_indexes, mem_ptr->lut ->get_lut_indexes (0 , 0 ),
2023+ mem_ptr->lut ->num_blocks * sizeof (Torus),
2024+ streams[0 ], gpu_indexes[0 ]);
2025+ cuda_synchronize_stream (streams[0 ], gpu_indexes[0 ]);
2026+ input_blocks->degrees [i] = mem_ptr->lut ->degrees [lut_indexes[0 ]];
2027+ input_blocks->noise_levels [i] = NoiseLevel::NOMINAL;
20162028
20172029 if (i < num_blocks - 1 ) {
2018- auto next_input_block = &input_blocks[(i + 1 ) * big_lwe_size];
2019- legacy_host_addition<Torus>(streams[0 ], gpu_indexes[0 ], next_input_block,
2020- (Torus const *)next_input_block,
2021- &mem_ptr->tmp_big_lwe_vector [big_lwe_size],
2022- params.big_lwe_dimension , 1 );
2030+ CudaRadixCiphertextFFI next_input_block;
2031+ as_radix_ciphertext_slice<Torus>(&next_input_block, input_blocks, i + 1 ,
2032+ i + 2 );
2033+ CudaRadixCiphertextFFI second_input;
2034+ as_radix_ciphertext_slice<Torus>(&second_input,
2035+ mem_ptr->tmp_big_lwe_vector , 1 , 2 );
2036+
2037+ host_addition<Torus>(streams[0 ], gpu_indexes[0 ], &next_input_block,
2038+ &next_input_block, &second_input, 1 );
20232039 }
20242040 }
20252041}
0 commit comments