
Commit ce327b7

chore(gpu): refactor mul/scalar mul to track noise/degree
1 parent 877d023 commit ce327b7

File tree

21 files changed: +523 -544 lines changed


backends/tfhe-cuda-backend/cuda/include/integer/integer.h

Lines changed: 11 additions & 10 deletions
@@ -132,10 +132,11 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
 
 void cuda_integer_mult_radix_ciphertext_kb_64(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *radix_lwe_out, void const *radix_lwe_left, bool const is_bool_left,
-    void const *radix_lwe_right, bool const is_bool_right, void *const *bsks,
-    void *const *ksks, int8_t *mem_ptr, uint32_t polynomial_size,
-    uint32_t num_blocks);
+    CudaRadixCiphertextFFI *radix_lwe_out,
+    CudaRadixCiphertextFFI const *radix_lwe_left, bool const is_bool_left,
+    CudaRadixCiphertextFFI const *radix_lwe_right, bool const is_bool_right,
+    void *const *bsks, void *const *ksks, int8_t *mem_ptr,
+    uint32_t polynomial_size, uint32_t num_blocks);
 
 void cleanup_cuda_integer_mult(void *const *streams,
                                uint32_t const *gpu_indexes, uint32_t gpu_count,

@@ -375,9 +376,9 @@ void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
 
 void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *radix_lwe_out, void *radix_lwe_vec, uint32_t num_radix_in_vec,
-    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
-    uint32_t num_blocks_in_radix);
+    CudaRadixCiphertextFFI *radix_lwe_out,
+    CudaRadixCiphertextFFI *radix_lwe_vec, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks);
 
 void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,

@@ -393,10 +394,10 @@ void scratch_cuda_integer_scalar_mul_kb_64(
 
 void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array, uint64_t const *decomposed_scalar,
+    CudaRadixCiphertextFFI *lwe_array, uint64_t const *decomposed_scalar,
     uint64_t const *has_at_least_one_set, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, uint32_t lwe_dimension, uint32_t polynomial_size,
-    uint32_t message_modulus, uint32_t num_blocks, uint32_t num_scalars);
+    void *const *ksks, uint32_t polynomial_size, uint32_t message_modulus,
+    uint32_t num_scalars);
 
 void cleanup_cuda_integer_radix_scalar_mul(void *const *streams,
                                            uint32_t const *gpu_indexes,
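
These entry points now take CudaRadixCiphertextFFI handles instead of raw void * LWE buffers, which is what lets the backend carry block count, dimension and (per the commit title) degree/noise metadata alongside the device pointer. The sketch below is only an inference from the fields this commit actually dereferences (ptr, num_radix_blocks, lwe_dimension); the real definition lives elsewhere in the backend, and the metadata fields here are assumptions.

// Hypothetical shape of the handle, inferred from this diff; not the backend's
// actual definition.
#include <cstdint>

struct CudaRadixCiphertextFFI_sketch {
  void *ptr;                 // device buffer holding num_radix_blocks LWE ciphertexts
  uint32_t num_radix_blocks; // how many radix blocks the buffer holds
  uint32_t lwe_dimension;    // per-block LWE dimension (lwe_size = lwe_dimension + 1)
  uint64_t *degrees;         // assumed: per-block degree tracking (commit title)
  uint64_t *noise_levels;    // assumed: per-block noise-level tracking (commit title)
};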

backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h

Lines changed: 71 additions & 80 deletions
@@ -21,7 +21,7 @@ template <typename Torus>
 __global__ void radix_blocks_rotate_right(Torus *dst, Torus *src,
                                           uint32_t value, uint32_t blocks_count,
                                           uint32_t lwe_size);
-void generate_ids_update_degrees(int *terms_degree, size_t *h_lwe_idx_in,
+void generate_ids_update_degrees(uint64_t *terms_degree, size_t *h_lwe_idx_in,
                                  size_t *h_lwe_idx_out,
                                  int32_t *h_smart_copy_in,
                                  int32_t *h_smart_copy_out, size_t ch_amount,

@@ -1161,10 +1161,10 @@ template <typename Torus> struct int_overflowing_sub_memory {
 };
 
 template <typename Torus> struct int_sum_ciphertexts_vec_memory {
-  Torus *new_blocks;
-  Torus *new_blocks_copy;
-  Torus *old_blocks;
-  Torus *small_lwe_vector;
+  CudaRadixCiphertextFFI *new_blocks;
+  CudaRadixCiphertextFFI *new_blocks_copy;
+  CudaRadixCiphertextFFI *old_blocks;
+  CudaRadixCiphertextFFI *small_lwe_vector;
   int_radix_params params;
 
   int32_t *d_smart_copy_in;

@@ -1183,34 +1183,22 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
     int max_pbs_count = num_blocks_in_radix * max_num_radix_in_vec;
 
     // allocate gpu memory for intermediate buffers
-    new_blocks = (Torus *)cuda_malloc_async(
-        max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus),
-        streams[0], gpu_indexes[0]);
-    new_blocks_copy = (Torus *)cuda_malloc_async(
-        max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus),
-        streams[0], gpu_indexes[0]);
-    old_blocks = (Torus *)cuda_malloc_async(
-        max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus),
-        streams[0], gpu_indexes[0]);
-    small_lwe_vector = (Torus *)cuda_malloc_async(
-        max_pbs_count * (params.small_lwe_dimension + 1) * sizeof(Torus),
-        streams[0], gpu_indexes[0]);
-    cuda_memset_async(new_blocks, 0,
-                      max_pbs_count * (params.big_lwe_dimension + 1) *
-                          sizeof(Torus),
-                      streams[0], gpu_indexes[0]);
-    cuda_memset_async(new_blocks_copy, 0,
-                      max_pbs_count * (params.big_lwe_dimension + 1) *
-                          sizeof(Torus),
-                      streams[0], gpu_indexes[0]);
-    cuda_memset_async(old_blocks, 0,
-                      max_pbs_count * (params.big_lwe_dimension + 1) *
-                          sizeof(Torus),
-                      streams[0], gpu_indexes[0]);
-    cuda_memset_async(small_lwe_vector, 0,
-                      max_pbs_count * (params.small_lwe_dimension + 1) *
-                          sizeof(Torus),
-                      streams[0], gpu_indexes[0]);
+    new_blocks = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
+                                              new_blocks, max_pbs_count,
+                                              params.big_lwe_dimension);
+    new_blocks_copy = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
+                                              new_blocks_copy, max_pbs_count,
+                                              params.big_lwe_dimension);
+    old_blocks = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
+                                              old_blocks, max_pbs_count,
+                                              params.big_lwe_dimension);
+    small_lwe_vector = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
+                                              small_lwe_vector, max_pbs_count,
+                                              params.small_lwe_dimension);
 
     d_smart_copy_in = (int32_t *)cuda_malloc_async(
         max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0]);

@@ -1227,8 +1215,9 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
                                  uint32_t gpu_count, int_radix_params params,
                                  uint32_t num_blocks_in_radix,
                                  uint32_t max_num_radix_in_vec,
-                                 Torus *new_blocks, Torus *old_blocks,
-                                 Torus *small_lwe_vector) {
+                                 CudaRadixCiphertextFFI *new_blocks,
+                                 CudaRadixCiphertextFFI *old_blocks,
+                                 CudaRadixCiphertextFFI *small_lwe_vector) {
     mem_reuse = true;
     this->params = params;

@@ -1238,13 +1227,10 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
     this->new_blocks = new_blocks;
     this->old_blocks = old_blocks;
     this->small_lwe_vector = small_lwe_vector;
-    new_blocks_copy = (Torus *)cuda_malloc_async(
-        max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus),
-        streams[0], gpu_indexes[0]);
-    cuda_memset_async(new_blocks_copy, 0,
-                      max_pbs_count * (params.big_lwe_dimension + 1) *
-                          sizeof(Torus),
-                      streams[0], gpu_indexes[0]);
+    new_blocks_copy = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
+                                              new_blocks_copy, max_pbs_count,
+                                              params.big_lwe_dimension);
 
     d_smart_copy_in = (int32_t *)cuda_malloc_async(
         max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0]);

@@ -1262,12 +1248,15 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
     cuda_drop_async(d_smart_copy_out, streams[0], gpu_indexes[0]);
 
     if (!mem_reuse) {
-      cuda_drop_async(new_blocks, streams[0], gpu_indexes[0]);
-      cuda_drop_async(old_blocks, streams[0], gpu_indexes[0]);
-      cuda_drop_async(small_lwe_vector, streams[0], gpu_indexes[0]);
+      release_radix_ciphertext(streams[0], gpu_indexes[0], new_blocks);
+      delete new_blocks;
+      release_radix_ciphertext(streams[0], gpu_indexes[0], old_blocks);
+      delete old_blocks;
+      release_radix_ciphertext(streams[0], gpu_indexes[0], small_lwe_vector);
+      delete small_lwe_vector;
     }
-
-    cuda_drop_async(new_blocks_copy, streams[0], gpu_indexes[0]);
+    release_radix_ciphertext(streams[0], gpu_indexes[0], new_blocks_copy);
+    delete new_blocks_copy;
   }
 };
 // For sequential algorithm in group propagation
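
Every intermediate buffer touched by this commit follows the same allocate/release pairing: new CudaRadixCiphertextFFI plus create_zero_radix_ciphertext_async<Torus> at scratch time, then release_radix_ciphertext plus delete at cleanup. The helpers below are a minimal sketch of that pairing, not part of the backend; they assume the helper signatures exactly as they are used in the hunks above.

// Hypothetical convenience wrappers illustrating the pattern above; the backend
// open-codes these calls instead of using such helpers.
template <typename Torus>
CudaRadixCiphertextFFI *alloc_zero_radix(cudaStream_t stream, uint32_t gpu_index,
                                         uint32_t num_blocks,
                                         uint32_t lwe_dimension) {
  auto *ct = new CudaRadixCiphertextFFI;
  // Allocates and zeroes num_blocks LWE ciphertexts of size lwe_dimension + 1
  // (and, presumably, resets the tracked degree/noise metadata).
  create_zero_radix_ciphertext_async<Torus>(stream, gpu_index, ct, num_blocks,
                                            lwe_dimension);
  return ct;
}

inline void drop_radix(cudaStream_t stream, uint32_t gpu_index,
                       CudaRadixCiphertextFFI *ct) {
  release_radix_ciphertext(stream, gpu_index, ct); // frees the device buffer
  delete ct;                                       // frees the host-side handle
}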
@@ -2482,7 +2471,7 @@ template <typename Torus> struct int_zero_out_if_buffer {
 
   int_radix_params params;
 
-  Torus *tmp;
+  CudaRadixCiphertextFFI *tmp;
 
   cudaStream_t *true_streams;
   cudaStream_t *false_streams;

@@ -2495,10 +2484,11 @@ template <typename Torus> struct int_zero_out_if_buffer {
     this->params = params;
     active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
 
-    Torus big_size =
-        (params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus);
     if (allocate_gpu_memory) {
-      tmp = (Torus *)cuda_malloc_async(big_size, streams[0], gpu_indexes[0]);
+      tmp = new CudaRadixCiphertextFFI;
+      create_zero_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], tmp,
+                                                num_radix_blocks,
+                                                params.big_lwe_dimension);
       // We may use a different stream to allow concurrent operation
       true_streams =
           (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));

@@ -2512,7 +2502,8 @@ template <typename Torus> struct int_zero_out_if_buffer {
   }
   void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                uint32_t gpu_count) {
-    cuda_drop_async(tmp, streams[0], gpu_indexes[0]);
+    release_radix_ciphertext(streams[0], gpu_indexes[0], tmp);
+    delete tmp;
     for (uint j = 0; j < active_gpu_count; j++) {
       cuda_destroy_stream(true_streams[j], gpu_indexes[j]);
       cuda_destroy_stream(false_streams[j], gpu_indexes[j]);

@@ -2523,9 +2514,9 @@ template <typename Torus> struct int_zero_out_if_buffer {
 };
 
 template <typename Torus> struct int_mul_memory {
-  Torus *vector_result_sb;
-  Torus *block_mul_res;
-  Torus *small_lwe_vector;
+  CudaRadixCiphertextFFI *vector_result_sb;
+  CudaRadixCiphertextFFI *block_mul_res;
+  CudaRadixCiphertextFFI *small_lwe_vector;
 
   int_radix_lut<Torus> *luts_array; // lsb msb
   int_radix_lut<Torus> *zero_out_predicate_lut;

@@ -2574,7 +2565,6 @@ template <typename Torus> struct int_mul_memory {
     auto polynomial_size = params.polynomial_size;
     auto message_modulus = params.message_modulus;
    auto carry_modulus = params.carry_modulus;
-    auto lwe_dimension = params.small_lwe_dimension;
 
     // 'vector_result_lsb' contains blocks from all possible shifts of
     // radix_lwe_left excluding zero ciphertext blocks

@@ -2587,17 +2577,18 @@ template <typename Torus> struct int_mul_memory {
     int total_block_count = lsb_vector_block_count + msb_vector_block_count;
 
     // allocate memory for intermediate buffers
-    vector_result_sb = (Torus *)cuda_malloc_async(
-        2 * total_block_count * (polynomial_size * glwe_dimension + 1) *
-            sizeof(Torus),
-        streams[0], gpu_indexes[0]);
-    block_mul_res = (Torus *)cuda_malloc_async(
-        2 * total_block_count * (polynomial_size * glwe_dimension + 1) *
-            sizeof(Torus),
-        streams[0], gpu_indexes[0]);
-    small_lwe_vector = (Torus *)cuda_malloc_async(
-        total_block_count * (lwe_dimension + 1) * sizeof(Torus), streams[0],
-        gpu_indexes[0]);
+    vector_result_sb = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(
+        streams[0], gpu_indexes[0], vector_result_sb, 2 * total_block_count,
+        params.big_lwe_dimension);
+    block_mul_res = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(
+        streams[0], gpu_indexes[0], block_mul_res, 2 * total_block_count,
+        params.big_lwe_dimension);
+    small_lwe_vector = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(
+        streams[0], gpu_indexes[0], small_lwe_vector, total_block_count,
+        params.small_lwe_dimension);
 
     // create int_radix_lut objects for lsb, msb, message, carry
     // luts_array -> lut = {lsb_acc, msb_acc}

@@ -2658,9 +2649,12 @@ template <typename Torus> struct int_mul_memory {
 
       return;
     }
-    cuda_drop_async(vector_result_sb, streams[0], gpu_indexes[0]);
-    cuda_drop_async(block_mul_res, streams[0], gpu_indexes[0]);
-    cuda_drop_async(small_lwe_vector, streams[0], gpu_indexes[0]);
+    release_radix_ciphertext(streams[0], gpu_indexes[0], vector_result_sb);
+    delete vector_result_sb;
+    release_radix_ciphertext(streams[0], gpu_indexes[0], block_mul_res);
+    delete block_mul_res;
+    release_radix_ciphertext(streams[0], gpu_indexes[0], small_lwe_vector);
+    delete small_lwe_vector;
 
     luts_array->release(streams, gpu_indexes, gpu_count);
     sum_ciphertexts_mem->release(streams, gpu_indexes, gpu_count);
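
A quick sanity check on the int_mul_memory sizes: the old code allocated 2 * total_block_count * (polynomial_size * glwe_dimension + 1) * sizeof(Torus) bytes, while the new code asks for 2 * total_block_count blocks of dimension params.big_lwe_dimension. The standalone snippet below shows the two are byte-for-byte identical under the usual relation big_lwe_dimension = glwe_dimension * polynomial_size; the concrete parameter values are examples, not taken from the commit.

// Illustrative, standalone check of the size equivalence described above.
#include <cassert>
#include <cstddef>
#include <cstdint>

int main() {
  using Torus = uint64_t;
  const size_t glwe_dimension = 1, polynomial_size = 2048; // example values
  const size_t big_lwe_dimension = glwe_dimension * polynomial_size;
  const size_t total_block_count = 8;                      // example value

  const size_t old_bytes = 2 * total_block_count *
                           (polynomial_size * glwe_dimension + 1) * sizeof(Torus);
  const size_t new_bytes =
      2 * total_block_count * (big_lwe_dimension + 1) * sizeof(Torus);
  assert(old_bytes == new_bytes); // same device footprint, now expressed in blocks
  return 0;
}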
@@ -4435,7 +4429,7 @@ template <typename Torus> struct int_scalar_mul_buffer {
   int_radix_params params;
   int_logical_scalar_shift_buffer<Torus> *logical_scalar_shift_buffer;
   int_sum_ciphertexts_vec_memory<Torus> *sum_ciphertexts_vec_mem;
-  Torus *preshifted_buffer;
+  CudaRadixCiphertextFFI *preshifted_buffer;
   CudaRadixCiphertextFFI *all_shifted_buffer;
   int_sc_prop_memory<Torus> *sc_prop_mem;
   bool anticipated_buffers_drop;

@@ -4450,25 +4444,21 @@ template <typename Torus> struct int_scalar_mul_buffer {
 
     if (allocate_gpu_memory) {
       uint32_t msg_bits = (uint32_t)std::log2(params.message_modulus);
-      uint32_t lwe_size = params.big_lwe_dimension + 1;
-      uint32_t lwe_size_bytes = lwe_size * sizeof(Torus);
       size_t num_ciphertext_bits = msg_bits * num_radix_blocks;
 
       //// Contains all shifted values of lhs for shift in range (0..msg_bits)
       //// The idea is that with these we can create all other shift that are
       /// in / range (0..total_bits) for free (block rotation)
-      preshifted_buffer = (Torus *)cuda_malloc_async(
-          num_ciphertext_bits * lwe_size_bytes, streams[0], gpu_indexes[0]);
+      preshifted_buffer = new CudaRadixCiphertextFFI;
+      create_zero_radix_ciphertext_async<Torus>(
+          streams[0], gpu_indexes[0], preshifted_buffer, num_ciphertext_bits,
+          params.big_lwe_dimension);
 
       all_shifted_buffer = new CudaRadixCiphertextFFI;
       create_zero_radix_ciphertext_async<Torus>(
           streams[0], gpu_indexes[0], all_shifted_buffer,
           num_ciphertext_bits * num_radix_blocks, params.big_lwe_dimension);
 
-      cuda_memset_async(preshifted_buffer, 0,
-                        num_ciphertext_bits * lwe_size_bytes, streams[0],
-                        gpu_indexes[0]);
-
       if (num_ciphertext_bits * num_radix_blocks >= num_radix_blocks + 2)
         logical_scalar_shift_buffer =
             new int_logical_scalar_shift_buffer<Torus>(

@@ -4500,7 +4490,8 @@ template <typename Torus> struct int_scalar_mul_buffer {
     release_radix_ciphertext(streams[0], gpu_indexes[0], all_shifted_buffer);
     delete all_shifted_buffer;
     if (!anticipated_buffers_drop) {
-      cuda_drop_async(preshifted_buffer, streams[0], gpu_indexes[0]);
+      release_radix_ciphertext(streams[0], gpu_indexes[0], preshifted_buffer);
+      delete preshifted_buffer;
       logical_scalar_shift_buffer->release(streams, gpu_indexes, gpu_count);
       delete (logical_scalar_shift_buffer);
     }
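
The scalar-mul buffers keep the same shapes as before, just expressed in radix blocks: preshifted_buffer holds num_ciphertext_bits = msg_bits * num_radix_blocks blocks (one full radix ciphertext per shift in 0..msg_bits), and all_shifted_buffer holds num_ciphertext_bits * num_radix_blocks blocks. The snippet below only walks through that arithmetic with example parameters (message_modulus = 4, eight radix blocks), which are not taken from the commit.

// Standalone sizing walk-through for the buffers above (example parameters).
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t message_modulus = 4;  // example: 2 message bits per block
  const uint32_t num_radix_blocks = 8; // example radix width

  const uint32_t msg_bits = (uint32_t)std::log2(message_modulus); // 2
  const size_t num_ciphertext_bits = (size_t)msg_bits * num_radix_blocks;

  // preshifted_buffer: one full radix ciphertext per shift amount in 0..msg_bits
  std::printf("preshifted_buffer blocks:  %zu\n", num_ciphertext_bits); // 16
  // all_shifted_buffer: every preshifted value block-rotated across the radix
  std::printf("all_shifted_buffer blocks: %zu\n",
              num_ciphertext_bits * num_radix_blocks); // 128
  return 0;
}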

backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh

Lines changed: 14 additions & 6 deletions
@@ -7,11 +7,20 @@
 template <typename Torus>
 __host__ void zero_out_if(cudaStream_t const *streams,
                           uint32_t const *gpu_indexes, uint32_t gpu_count,
-                          Torus *lwe_array_out, Torus const *lwe_array_input,
-                          Torus const *lwe_condition,
+                          CudaRadixCiphertextFFI *lwe_array_out,
+                          CudaRadixCiphertextFFI const *lwe_array_input,
+                          CudaRadixCiphertextFFI const *lwe_condition,
                           int_zero_out_if_buffer<Torus> *mem_ptr,
                           int_radix_lut<Torus> *predicate, void *const *bsks,
                           Torus *const *ksks, uint32_t num_radix_blocks) {
+  if (lwe_array_out->num_radix_blocks < num_radix_blocks ||
+      lwe_array_input->num_radix_blocks < num_radix_blocks)
+    PANIC("Cuda error: input or output radix ciphertexts does not have enough "
+          "blocks")
+  if (lwe_array_out->lwe_dimension != lwe_array_input->lwe_dimension ||
+      lwe_array_input->lwe_dimension != lwe_condition->lwe_dimension)
+    PANIC("Cuda error: input and output radix ciphertexts must have the same "
+          "lwe dimension")
   cuda_set_device(gpu_indexes[0]);
   auto params = mem_ptr->params;
 

@@ -21,12 +30,11 @@ __host__ void zero_out_if(cudaStream_t const *streams,
   host_pack_bivariate_blocks_with_single_block<Torus>(
       streams, gpu_indexes, gpu_count, tmp_lwe_array_input,
       predicate->lwe_indexes_in, lwe_array_input, lwe_condition,
-      predicate->lwe_indexes_in, params.big_lwe_dimension,
-      params.message_modulus, num_radix_blocks);
+      predicate->lwe_indexes_in, params.message_modulus, num_radix_blocks);
 
-  legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
+  integer_radix_apply_univariate_lookup_table_kb<Torus>(
       streams, gpu_indexes, gpu_count, lwe_array_out, tmp_lwe_array_input, bsks,
-      ksks, num_radix_blocks, predicate);
+      ksks, predicate, num_radix_blocks);
 }
 
 template <typename Torus>
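
The same two guards (enough blocks, matching LWE dimensions) are added at the top of several host functions in this commit, here and in integer.cuh below. A hypothetical helper that factors out that check might look like the sketch below; the backend itself inlines the checks rather than using such a helper, and PANIC is the backend's own macro as used above.

// Hypothetical consolidation of the guard checks added above; illustrative only.
template <typename FFI>
void assert_radix_compatible(const FFI *out, const FFI *in,
                             uint32_t num_radix_blocks) {
  if (out->num_radix_blocks < num_radix_blocks ||
      in->num_radix_blocks < num_radix_blocks)
    PANIC("Cuda error: input or output radix ciphertexts does not have enough "
          "blocks")
  if (out->lwe_dimension != in->lwe_dimension)
    PANIC("Cuda error: input and output radix ciphertexts must have the same "
          "lwe dimension")
}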

backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh

Lines changed: 15 additions & 4 deletions
@@ -557,18 +557,29 @@ __global__ void device_pack_bivariate_blocks_with_single_block(
 template <typename Torus>
 __host__ void host_pack_bivariate_blocks_with_single_block(
     cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_indexes_out,
-    Torus const *lwe_array_1, Torus const *lwe_2, Torus const *lwe_indexes_in,
-    uint32_t lwe_dimension, uint32_t shift, uint32_t num_radix_blocks) {
+    uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
+    Torus const *lwe_indexes_out, CudaRadixCiphertextFFI const *lwe_array_1,
+    CudaRadixCiphertextFFI const *lwe_2, Torus const *lwe_indexes_in,
+    uint32_t shift, uint32_t num_radix_blocks) {
 
+  if (lwe_array_out->num_radix_blocks < num_radix_blocks ||
+      lwe_array_1->num_radix_blocks < num_radix_blocks)
+    PANIC("Cuda error: input or output radix ciphertexts does not have enough "
+          "blocks")
+  if (lwe_array_out->lwe_dimension != lwe_array_1->lwe_dimension ||
+      lwe_array_1->lwe_dimension != lwe_2->lwe_dimension)
+    PANIC("Cuda error: input and output radix ciphertexts must have the same "
+          "lwe dimension")
+  auto lwe_dimension = lwe_array_out->lwe_dimension;
   cuda_set_device(gpu_indexes[0]);
   // Left message is shifted
   int num_blocks = 0, num_threads = 0;
   int num_entries = num_radix_blocks * (lwe_dimension + 1);
   getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
   device_pack_bivariate_blocks_with_single_block<Torus>
       <<<num_blocks, num_threads, 0, streams[0]>>>(
-          lwe_array_out, lwe_indexes_out, lwe_array_1, lwe_2, lwe_indexes_in,
+          (Torus *)lwe_array_out->ptr, lwe_indexes_out,
+          (Torus *)lwe_array_1->ptr, (Torus *)lwe_2->ptr, lwe_indexes_in,
           lwe_dimension, shift, num_radix_blocks);
   check_cuda_error(cudaGetLastError());
 }
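
host_pack_bivariate_blocks_with_single_block shows the general shape of the refactor at the kernel boundary: the host wrapper validates the handles, reads lwe_dimension from the output handle instead of taking it as a parameter, and only the raw device pointers ((Torus *)ffi->ptr) cross into the __global__ kernel. The toy below mirrors that structure with simplified stand-in types and a placeholder kernel; it is not the backend's code.

// Self-contained toy of the "unwrap at the kernel boundary" pattern; the struct
// and kernel are simplified stand-ins, not the backend's real definitions.
#include <cstdint>
#include <cuda_runtime.h>

struct RadixHandle {          // stand-in for CudaRadixCiphertextFFI
  void *ptr;                  // device buffer of num_radix_blocks * (lwe_dimension + 1) Torus
  uint32_t num_radix_blocks;
  uint32_t lwe_dimension;
};

__global__ void touch_blocks(uint64_t *lwe, uint32_t num_entries) {
  uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < num_entries)
    lwe[idx] += 0; // placeholder body
}

void host_touch(cudaStream_t stream, RadixHandle *ct, uint32_t num_radix_blocks) {
  // Metadata stays on the handle; the kernel only ever sees the raw pointer.
  uint32_t lwe_size = ct->lwe_dimension + 1;
  uint32_t num_entries = num_radix_blocks * lwe_size;
  uint32_t threads = 512;
  uint32_t blocks = (num_entries + threads - 1) / threads;
  touch_blocks<<<blocks, threads, 0, stream>>>((uint64_t *)ct->ptr, num_entries);
}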
