chore(gpu): refactor full propagation to track noise / degree

agnesLeroy · agnesLeroy · commit 8962d1f92536 · 2025-03-05T11:06:30.000+01:00
diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
@@ -113,7 +113,8 @@ void scratch_cuda_full_propagation_64(
 
 void cuda_full_propagation_64_inplace(void *const *streams,
                                       uint32_t const *gpu_indexes,
-                                      uint32_t gpu_count, void *input_blocks,
+                                      uint32_t gpu_count,
+                                      CudaRadixCiphertextFFI *input_blocks,
                                       int8_t *mem_ptr, void *const *ksks,
                                       void *const *bsks, uint32_t num_blocks);
 
diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
@@ -841,8 +841,8 @@ template <typename Torus> struct int_fullprop_buffer {
 
   int_radix_lut<Torus> *lut;
 
-  Torus *tmp_small_lwe_vector;
-  Torus *tmp_big_lwe_vector;
+  CudaRadixCiphertextFFI *tmp_small_lwe_vector;
+  CudaRadixCiphertextFFI *tmp_big_lwe_vector;
 
   int_fullprop_buffer(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                       uint32_t gpu_count, int_radix_params params,
@@ -889,17 +889,14 @@ template <typename Torus> struct int_fullprop_buffer {
 
       lut->broadcast_lut(streams, gpu_indexes, 0);
 
-      // Temporary arrays
-      Torus small_vector_size =
-          2 * (params.small_lwe_dimension + 1) * sizeof(Torus);
-      Torus big_vector_size =
-          2 * (params.glwe_dimension * params.polynomial_size + 1) *
-          sizeof(Torus);
-
-      tmp_small_lwe_vector = (Torus *)cuda_malloc_async(
-          small_vector_size, streams[0], gpu_indexes[0]);
-      tmp_big_lwe_vector = (Torus *)cuda_malloc_async(
-          big_vector_size, streams[0], gpu_indexes[0]);
+      tmp_small_lwe_vector = new CudaRadixCiphertextFFI;
+      create_zero_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
+                                                tmp_small_lwe_vector, 2,
+                                                params.small_lwe_dimension);
+      tmp_big_lwe_vector = new CudaRadixCiphertextFFI;
+      create_zero_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
+                                                tmp_big_lwe_vector, 2,
+                                                params.big_lwe_dimension);
       cuda_synchronize_stream(streams[0], gpu_indexes[0]);
       free(h_lwe_indexes);
     }
@@ -911,8 +908,10 @@ template <typename Torus> struct int_fullprop_buffer {
     lut->release(streams, gpu_indexes, 1);
     delete lut;
 
-    cuda_drop_async(tmp_small_lwe_vector, streams[0], gpu_indexes[0]);
-    cuda_drop_async(tmp_big_lwe_vector, streams[0], gpu_indexes[0]);
+    release_radix_ciphertext(streams[0], gpu_indexes[0], tmp_small_lwe_vector);
+    delete tmp_small_lwe_vector;
+    release_radix_ciphertext(streams[0], gpu_indexes[0], tmp_big_lwe_vector);
+    delete tmp_big_lwe_vector;
   }
 };
 
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
@@ -4,17 +4,17 @@
 
 void cuda_full_propagation_64_inplace(void *const *streams,
                                       uint32_t const *gpu_indexes,
-                                      uint32_t gpu_count, void *input_blocks,
+                                      uint32_t gpu_count,
+                                      CudaRadixCiphertextFFI *input_blocks,
                                       int8_t *mem_ptr, void *const *ksks,
                                       void *const *bsks, uint32_t num_blocks) {
 
   int_fullprop_buffer<uint64_t> *buffer =
       (int_fullprop_buffer<uint64_t> *)mem_ptr;
 
-  host_full_propagate_inplace<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      static_cast<uint64_t *>(input_blocks), buffer, (uint64_t **)(ksks), bsks,
-      num_blocks);
+  host_full_propagate_inplace<uint64_t>((cudaStream_t *)(streams), gpu_indexes,
+                                        gpu_count, input_blocks, buffer,
+                                        (uint64_t **)(ksks), bsks, num_blocks);
 }
 
 void scratch_cuda_full_propagation_64(
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -689,6 +689,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
   cuda_memcpy_async_to_cpu(&lut_indexes, lut->get_lut_indexes(0, 0),
                            lut->num_blocks * sizeof(Torus), streams[0],
                            gpu_indexes[0]);
+  cuda_synchronize_stream(streams[0], gpu_indexes[0]);
   for (uint i = 0; i < num_radix_blocks; i++) {
     lwe_array_out->degrees[i] = lut->degrees[lut_indexes[i]];
     lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
@@ -964,6 +965,7 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
   cuda_memcpy_async_to_cpu(&lut_indexes, lut->get_lut_indexes(0, 0),
                            lut->num_blocks * sizeof(Torus), streams[0],
                            gpu_indexes[0]);
+  cuda_synchronize_stream(streams[0], gpu_indexes[0]);
   for (uint i = 0; i < lwe_array_out->num_radix_blocks; i++) {
     lwe_array_out->degrees[i] = lut->degrees[i % lut->num_blocks];
     lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
@@ -1173,6 +1175,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
   cuda_memcpy_async_to_cpu(&lut_indexes, lut->get_lut_indexes(0, 0),
                            lut->num_blocks * sizeof(Torus), streams[0],
                            gpu_indexes[0]);
+  cuda_synchronize_stream(streams[0], gpu_indexes[0]);
   for (uint i = 0; i < num_radix_blocks; i++) {
     lwe_array_out->degrees[i] = lut->degrees[lut_indexes[i]];
     lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
@@ -1974,7 +1977,8 @@ void host_compute_shifted_blocks_and_borrow_states(
 template <typename Torus>
 void host_full_propagate_inplace(cudaStream_t const *streams,
                                  uint32_t const *gpu_indexes,
-                                 uint32_t gpu_count, Torus *input_blocks,
+                                 uint32_t gpu_count,
+                                 CudaRadixCiphertextFFI *input_blocks,
                                  int_fullprop_buffer<Torus> *mem_ptr,
                                  Torus *const *ksks, void *const *bsks,
                                  uint32_t num_blocks) {
@@ -1987,39 +1991,51 @@ void host_full_propagate_inplace(cudaStream_t const *streams,
   uint32_t num_many_lut = 1;
   uint32_t lut_stride = 0;
   for (int i = 0; i < num_blocks; i++) {
-    auto cur_input_block = &input_blocks[i * big_lwe_size];
+    CudaRadixCiphertextFFI cur_input_block;
+    as_radix_ciphertext_slice<Torus>(&cur_input_block, input_blocks, i, i + 1);
 
     /// Since the keyswitch is done on one input only, use only 1 GPU
     execute_keyswitch_async<Torus>(
-        streams, gpu_indexes, 1, mem_ptr->tmp_small_lwe_vector,
-        mem_ptr->lut->lwe_trivial_indexes, cur_input_block,
+        streams, gpu_indexes, 1, (Torus *)(mem_ptr->tmp_small_lwe_vector->ptr),
+        mem_ptr->lut->lwe_trivial_indexes, (Torus *)cur_input_block.ptr,
         mem_ptr->lut->lwe_trivial_indexes, ksks, params.big_lwe_dimension,
         params.small_lwe_dimension, params.ks_base_log, params.ks_level, 1);
 
-    cuda_memcpy_async_gpu_to_gpu(&mem_ptr->tmp_small_lwe_vector[small_lwe_size],
-                                 mem_ptr->tmp_small_lwe_vector,
-                                 small_lwe_size * sizeof(Torus), streams[0],
-                                 gpu_indexes[0]);
+    copy_radix_ciphertext_slice_async<Torus>(
+        streams[0], gpu_indexes[0], mem_ptr->tmp_small_lwe_vector, 1, 2,
+        mem_ptr->tmp_small_lwe_vector, 0, 1);
 
     execute_pbs_async<Torus>(
-        streams, gpu_indexes, 1, mem_ptr->tmp_big_lwe_vector,
+        streams, gpu_indexes, 1, (Torus *)mem_ptr->tmp_big_lwe_vector->ptr,
         mem_ptr->lut->lwe_trivial_indexes, mem_ptr->lut->lut_vec,
-        mem_ptr->lut->lut_indexes_vec, mem_ptr->tmp_small_lwe_vector,
+        mem_ptr->lut->lut_indexes_vec,
+        (Torus *)mem_ptr->tmp_small_lwe_vector->ptr,
         mem_ptr->lut->lwe_trivial_indexes, bsks, mem_ptr->lut->buffer,
         params.glwe_dimension, params.small_lwe_dimension,
         params.polynomial_size, params.pbs_base_log, params.pbs_level,
         params.grouping_factor, 2, params.pbs_type, num_many_lut, lut_stride);
 
-    cuda_memcpy_async_gpu_to_gpu(
-        (void *)cur_input_block, mem_ptr->tmp_big_lwe_vector,
-        big_lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]);
+    copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
+                                             &cur_input_block, 0, 1,
+                                             mem_ptr->tmp_big_lwe_vector, 0, 1);
+    Torus lut_indexes[mem_ptr->lut->num_blocks];
+    cuda_memcpy_async_to_cpu(&lut_indexes, mem_ptr->lut->get_lut_indexes(0, 0),
+                             mem_ptr->lut->num_blocks * sizeof(Torus),
+                             streams[0], gpu_indexes[0]);
+    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+    input_blocks->degrees[i] = mem_ptr->lut->degrees[lut_indexes[0]];
+    input_blocks->noise_levels[i] = NoiseLevel::NOMINAL;
 
     if (i < num_blocks - 1) {
-      auto next_input_block = &input_blocks[(i + 1) * big_lwe_size];
-      legacy_host_addition<Torus>(streams[0], gpu_indexes[0], next_input_block,
-                                  (Torus const *)next_input_block,
-                                  &mem_ptr->tmp_big_lwe_vector[big_lwe_size],
-                                  params.big_lwe_dimension, 1);
+      CudaRadixCiphertextFFI next_input_block;
+      as_radix_ciphertext_slice<Torus>(&next_input_block, input_blocks, i + 1,
+                                       i + 2);
+      CudaRadixCiphertextFFI second_input;
+      as_radix_ciphertext_slice<Torus>(&second_input,
+                                       mem_ptr->tmp_big_lwe_vector, 1, 2);
+
+      host_addition<Torus>(streams[0], gpu_indexes[0], &next_input_block,
+                           &next_input_block, &second_input, 1);
     }
   }
 }
diff --git a/backends/tfhe-cuda-backend/src/bindings.rs b/backends/tfhe-cuda-backend/src/bindings.rs
@@ -319,7 +319,7 @@ unsafe extern "C" {
         streams: *const *mut ffi::c_void,
         gpu_indexes: *const u32,
         gpu_count: u32,
-        input_blocks: *mut ffi::c_void,
+        input_blocks: *mut CudaRadixCiphertextFFI,
         mem_ptr: *mut i8,
         ksks: *const *mut ffi::c_void,
         bsks: *const *mut ffi::c_void,
diff --git a/tfhe/src/integer/gpu/mod.rs b/tfhe/src/integer/gpu/mod.rs
@@ -1249,7 +1249,7 @@ pub unsafe fn unchecked_scalar_comparison_integer_radix_kb_async<T: UnsignedInte
 ///   is required
 pub unsafe fn full_propagate_assign_async<T: UnsignedInteger, B: Numeric>(
     streams: &CudaStreams,
-    radix_lwe_input: &mut CudaVec<T>,
+    radix_lwe_input: &mut CudaRadixCiphertext,
     bootstrapping_key: &CudaVec<B>,
     keyswitch_key: &CudaVec<T>,
     lwe_dimension: LweDimension,
@@ -1267,7 +1267,7 @@ pub unsafe fn full_propagate_assign_async<T: UnsignedInteger, B: Numeric>(
 ) {
     assert_eq!(
         streams.gpu_indexes[0],
-        radix_lwe_input.gpu_index(0),
+        radix_lwe_input.d_blocks.0.d_vec.gpu_index(0),
         "GPU error: all data should reside on the same GPU."
     );
     assert_eq!(
@@ -1281,6 +1281,23 @@ pub unsafe fn full_propagate_assign_async<T: UnsignedInteger, B: Numeric>(
         "GPU error: all data should reside on the same GPU."
     );
     let mut mem_ptr: *mut i8 = std::ptr::null_mut();
+    let mut radix_lwe_input_degrees = radix_lwe_input
+        .info
+        .blocks
+        .iter()
+        .map(|b| b.degree.0)
+        .collect();
+    let mut radix_lwe_input_noise_levels = radix_lwe_input
+        .info
+        .blocks
+        .iter()
+        .map(|b| b.noise_level.0)
+        .collect();
+    let mut cuda_ffi_radix_lwe_input = prepare_cuda_radix_ffi(
+        radix_lwe_input,
+        &mut radix_lwe_input_degrees,
+        &mut radix_lwe_input_noise_levels,
+    );
     scratch_cuda_full_propagation_64(
         streams.ptr.as_ptr(),
         streams.gpu_indexes_ptr(),
@@ -1303,7 +1320,7 @@ pub unsafe fn full_propagate_assign_async<T: UnsignedInteger, B: Numeric>(
         streams.ptr.as_ptr(),
         streams.gpu_indexes_ptr(),
         streams.len() as u32,
-        radix_lwe_input.as_mut_c_ptr(0),
+        &mut cuda_ffi_radix_lwe_input,
         mem_ptr,
         keyswitch_key.ptr.as_ptr(),
         bootstrapping_key.ptr.as_ptr(),
@@ -1315,6 +1332,7 @@ pub unsafe fn full_propagate_assign_async<T: UnsignedInteger, B: Numeric>(
         streams.len() as u32,
         std::ptr::addr_of_mut!(mem_ptr),
     );
+    update_noise_degree(radix_lwe_input, &cuda_ffi_radix_lwe_input);
 }
 
 #[allow(clippy::too_many_arguments)]
diff --git a/tfhe/src/integer/gpu/server_key/radix/add.rs b/tfhe/src/integer/gpu/server_key/radix/add.rs
@@ -569,7 +569,7 @@ impl CudaServerKey {
         let output_flag = OutputFlag::from_signedness(CudaSignedRadixCiphertext::IS_SIGNED);
 
         let mut ct_res = lhs.duplicate_async(stream);
-        let mut carry_out: CudaSignedRadixCiphertext = self
+        let carry_out: CudaSignedRadixCiphertext = self
             .add_and_propagate_single_carry_assign_async(
                 &mut ct_res,
                 rhs,
@@ -578,14 +578,6 @@ impl CudaServerKey {
                 output_flag,
             );
 
-        if lhs.as_ref().info.blocks.last().unwrap().noise_level == NoiseLevel::ZERO
-            && rhs.as_ref().info.blocks.last().unwrap().noise_level == NoiseLevel::ZERO
-        {
-            carry_out.as_mut().info = carry_out.as_ref().info.boolean_info(NoiseLevel::ZERO);
-        } else {
-            carry_out.as_mut().info = carry_out.as_ref().info.boolean_info(NoiseLevel::NOMINAL);
-        }
-
         let ct_overflowed = CudaBooleanBlock::from_cuda_radix_ciphertext(carry_out.ciphertext);
 
         (ct_res, ct_overflowed)
diff --git a/tfhe/src/integer/gpu/server_key/radix/mod.rs b/tfhe/src/integer/gpu/server_key/radix/mod.rs
@@ -383,7 +383,7 @@ impl CudaServerKey {
                 CudaBootstrappingKey::Classic(d_bsk) => {
                     full_propagate_assign_async(
                         streams,
-                        &mut ciphertext.d_blocks.0.d_vec,
+                        ciphertext,
                         &d_bsk.d_vec,
                         &self.key_switching_key.d_vec,
                         d_bsk.input_lwe_dimension(),
@@ -403,7 +403,7 @@ impl CudaServerKey {
                 CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
                     full_propagate_assign_async(
                         streams,
-                        &mut ciphertext.d_blocks.0.d_vec,
+                        ciphertext,
                         &d_multibit_bsk.d_vec,
                         &self.key_switching_key.d_vec,
                         d_multibit_bsk.input_lwe_dimension(),
@@ -422,14 +422,6 @@ impl CudaServerKey {
                 }
             }
         }
-        ciphertext.info.blocks.iter_mut().for_each(|b| {
-            b.degree = Degree::new(b.message_modulus.0 - 1);
-            b.noise_level = if b.noise_level == NoiseLevel::ZERO {
-                NoiseLevel::ZERO
-            } else {
-                NoiseLevel::NOMINAL
-            };
-        });
     }
 
     /// Prepend trivial zero LSB blocks to an existing [`CudaUnsignedRadixCiphertext`] or
diff --git a/tfhe/src/integer/server_key/radix_parallel/tests_signed/test_scalar_add.rs b/tfhe/src/integer/server_key/radix_parallel/tests_signed/test_scalar_add.rs
@@ -295,6 +295,10 @@ where
                 expected overflow flag {expected_overflowed}, got {decrypted_overflowed}"
         );
             assert_eq!(encrypted_overflow.0.degree.get(), 1);
+            #[cfg(feature = "gpu")]
+            assert_eq!(encrypted_overflow.0.noise_level(), NoiseLevel::NOMINAL);
+
+            #[cfg(not(feature = "gpu"))]
             assert_eq!(encrypted_overflow.0.noise_level(), NoiseLevel::ZERO);
         }
 
diff --git a/tfhe/src/integer/server_key/radix_parallel/tests_signed/test_scalar_sub.rs b/tfhe/src/integer/server_key/radix_parallel/tests_signed/test_scalar_sub.rs
@@ -224,6 +224,10 @@ where
                 expected overflow flag {expected_overflowed}, got {decrypted_overflowed}"
         );
         assert_eq!(encrypted_overflow.0.degree.get(), 1);
+        #[cfg(feature = "gpu")]
+        assert_eq!(encrypted_overflow.0.noise_level(), NoiseLevel::NOMINAL);
+
+        #[cfg(not(feature = "gpu"))]
         assert_eq!(encrypted_overflow.0.noise_level(), NoiseLevel::ZERO);
     }
 
diff --git a/tfhe/src/integer/server_key/radix_parallel/tests_signed/test_sub.rs b/tfhe/src/integer/server_key/radix_parallel/tests_signed/test_sub.rs
@@ -251,6 +251,10 @@ where
                 expected overflow flag {expected_overflowed}, got {decrypted_overflowed}"
             );
             assert_eq!(encrypted_overflow.0.degree.get(), 1);
+            #[cfg(feature = "gpu")]
+            assert_eq!(encrypted_overflow.0.noise_level(), NoiseLevel::NOMINAL);
+
+            #[cfg(not(feature = "gpu"))]
             assert_eq!(encrypted_overflow.0.noise_level(), NoiseLevel::ZERO);
         }
     }