Skip to content

Commit 1ae7155

Browse files
committed
move stream from encoder/decoder to coder
it is common for both
1 parent eb3b779 commit 1ae7155

12 files changed

+128
-118
lines changed

src/gpujpeg_common_internal.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -421,6 +421,9 @@ struct gpujpeg_coder
421421
size_t allocated_gpu_memory_size; ///< for gpujpeg_encoder_max_pixels() only (remove?)
422422

423423
int encoder; ///< 1 if we are encoder, 0 decoder
424+
425+
// Stream
426+
cudaStream_t stream;
424427
};
425428

426429
/**

src/gpujpeg_dct_gpu.cu

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/**
22
* @file
3-
* Copyright (c) 2011-2020, CESNET z.s.p.o
3+
* Copyright (c) 2011-2025, CESNET
44
* Copyright (c) 2011, Silicon Genome, LLC.
55
*
66
* All rights reserved.
@@ -641,7 +641,7 @@ gpujpeg_dct_gpu(struct gpujpeg_encoder* encoder)
641641
sizeof(gpujpeg_dct_gpu_quantization_table_const),
642642
0,
643643
cudaMemcpyDeviceToDevice,
644-
encoder->stream
644+
coder->stream
645645
);
646646
gpujpeg_cuda_check_error("Quantization table memcpy failed", return -1);
647647
}
@@ -662,7 +662,7 @@ gpujpeg_dct_gpu(struct gpujpeg_encoder* encoder)
662662
1
663663
);
664664
dim3 dct_block(4 * 8, WARP_COUNT);
665-
gpujpeg_dct_gpu_kernel<WARP_COUNT><<<dct_grid, dct_block, 0, encoder->stream>>>(
665+
gpujpeg_dct_gpu_kernel<WARP_COUNT><<<dct_grid, dct_block, 0, coder->stream>>>(
666666
block_count_x,
667667
block_count_y,
668668
component->d_data,
@@ -706,15 +706,15 @@ gpujpeg_idct_gpu(struct gpujpeg_decoder* decoder)
706706
64 * sizeof(uint16_t),
707707
0,
708708
cudaMemcpyDeviceToDevice,
709-
decoder->stream
709+
coder->stream
710710
);
711711
gpujpeg_cuda_check_error("Copy IDCT quantization table to constant memory", return -1);
712712
713713
dim3 dct_grid(gpujpeg_div_and_round_up(block_count_x * block_count_y,
714714
(GPUJPEG_IDCT_BLOCK_X * GPUJPEG_IDCT_BLOCK_Y * GPUJPEG_IDCT_BLOCK_Z) / GPUJPEG_BLOCK_SIZE), 1);
715715
dim3 dct_block(GPUJPEG_IDCT_BLOCK_X, GPUJPEG_IDCT_BLOCK_Y, GPUJPEG_IDCT_BLOCK_Z);
716716
717-
gpujpeg_idct_gpu_kernel<<<dct_grid, dct_block, 0, decoder->stream>>>(
717+
gpujpeg_idct_gpu_kernel<<<dct_grid, dct_block, 0, coder->stream>>>(
718718
component->d_data_quantized,
719719
component->d_data,
720720
component->data_width,

src/gpujpeg_decoder.c

Lines changed: 39 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,7 @@ gpujpeg_decoder_create(cudaStream_t stream)
146146
}
147147

148148
// Stream
149-
decoder->stream = stream;
149+
coder->stream = stream;
150150

151151
if (result == 0) {
152152
gpujpeg_decoder_destroy(decoder);
@@ -214,7 +214,7 @@ gpujpeg_decoder_init(struct gpujpeg_decoder* decoder, const struct gpujpeg_param
214214
fprintf(stderr, "[GPUJPEG] [Info] Reinitializing decoder.\n");
215215
}
216216

217-
if (0 == gpujpeg_coder_init_image(coder, param, param_image, decoder->stream)) {
217+
if (0 == gpujpeg_coder_init_image(coder, param, param_image, coder->stream)) {
218218
fprintf(stderr, "[GPUJPEG] [Error] Failed to init coder image!\n");
219219
return -1;
220220
}
@@ -241,15 +241,15 @@ gpujpeg_decoder_decode(struct gpujpeg_decoder* decoder, uint8_t* image, size_t i
241241

242242
coder->start_time = coder->param.perf_stats ? gpujpeg_get_time() : 0;
243243

244-
GPUJPEG_CUSTOM_TIMER_START(coder->duration_stream, coder->param.perf_stats, decoder->stream, return -1);
244+
GPUJPEG_CUSTOM_TIMER_START(coder->duration_stream, coder->param.perf_stats, coder->stream, return -1);
245245

246246
// Read JPEG image data
247247
if (0 != (rc = gpujpeg_reader_read_image(decoder, image, image_size))) {
248248
fprintf(stderr, "[GPUJPEG] [Error] Decoder failed when decoding image data!\n");
249249
return rc;
250250
}
251251

252-
GPUJPEG_CUSTOM_TIMER_STOP(coder->duration_stream, coder->param.perf_stats, decoder->stream, return -1);
252+
GPUJPEG_CUSTOM_TIMER_STOP(coder->duration_stream, coder->param.perf_stats, coder->stream, return -1);
253253

254254
// check if params is ok for GPU decoder
255255
for ( int i = 0; i < decoder->coder.param.comp_count; ++i ) {
@@ -273,7 +273,7 @@ gpujpeg_decoder_decode(struct gpujpeg_decoder* decoder, uint8_t* image, size_t i
273273

274274
// Perform huffman decoding on CPU (when there are not enough segments to saturate GPU)
275275
if ( use_cpu_huffman_decoder ) {
276-
GPUJPEG_CUSTOM_TIMER_START(coder->duration_huffman_coder, coder->param.perf_stats, decoder->stream, return -1);
276+
GPUJPEG_CUSTOM_TIMER_START(coder->duration_huffman_coder, coder->param.perf_stats, coder->stream, return -1);
277277
if (coder->data_quantized == NULL) {
278278
if (gpujpeg_coder_allocate_cpu_huffman_buf(coder) != 0) {
279279
return -1;
@@ -283,53 +283,55 @@ gpujpeg_decoder_decode(struct gpujpeg_decoder* decoder, uint8_t* image, size_t i
283283
fprintf(stderr, "[GPUJPEG] [Error] Huffman decoder failed!\n");
284284
return -1;
285285
}
286-
GPUJPEG_CUSTOM_TIMER_STOP(coder->duration_huffman_coder, coder->param.perf_stats, decoder->stream, return -1);
286+
GPUJPEG_CUSTOM_TIMER_STOP(coder->duration_huffman_coder, coder->param.perf_stats, coder->stream, return -1);
287287

288288
// Copy quantized data to device memory from cpu memory
289-
GPUJPEG_CUSTOM_TIMER_START(coder->duration_memory_to, coder->param.perf_stats, decoder->stream, return -1);
290-
cudaMemcpyAsync(coder->d_data_quantized, coder->data_quantized, coder->data_size * sizeof(int16_t), cudaMemcpyHostToDevice, decoder->stream);
291-
GPUJPEG_CUSTOM_TIMER_STOP(coder->duration_memory_to, coder->param.perf_stats, decoder->stream, return -1);
289+
GPUJPEG_CUSTOM_TIMER_START(coder->duration_memory_to, coder->param.perf_stats, coder->stream, return -1);
290+
cudaMemcpyAsync(coder->d_data_quantized, coder->data_quantized, coder->data_size * sizeof(int16_t),
291+
cudaMemcpyHostToDevice, coder->stream);
292+
GPUJPEG_CUSTOM_TIMER_STOP(coder->duration_memory_to, coder->param.perf_stats, coder->stream, return -1);
292293

293-
GPUJPEG_CUSTOM_TIMER_START(coder->duration_in_gpu, coder->param.perf_stats, decoder->stream, return -1);
294+
GPUJPEG_CUSTOM_TIMER_START(coder->duration_in_gpu, coder->param.perf_stats, coder->stream, return -1);
294295
}
295296
// Perform huffman decoding on GPU (when there are enough segments to saturate GPU)
296297
else {
297-
GPUJPEG_CUSTOM_TIMER_START(coder->duration_memory_to, coder->param.perf_stats, decoder->stream, return -1);
298+
GPUJPEG_CUSTOM_TIMER_START(coder->duration_memory_to, coder->param.perf_stats, coder->stream, return -1);
298299

299300
// Reset huffman output
300-
cudaMemsetAsync(coder->d_data_quantized, 0, coder->data_size * sizeof(int16_t), decoder->stream);
301+
cudaMemsetAsync(coder->d_data_quantized, 0, coder->data_size * sizeof(int16_t), coder->stream);
301302

302303
// Copy scan data to device memory
303304
gpujpeg_cuda_memcpy_async_partially_pinned(coder->d_data_compressed, coder->data_compressed, decoder->data_compressed_size,
304-
cudaMemcpyHostToDevice, decoder->stream,
305+
cudaMemcpyHostToDevice, coder->stream,
305306
coder->data_compressed_pinned_sz);
306307
gpujpeg_cuda_check_error("Decoder copy compressed data to memory", return -1);
307308

308309
// Copy segments to device memory
309-
cudaMemcpyAsync(coder->d_segment, coder->segment, decoder->segment_count * sizeof(struct gpujpeg_segment), cudaMemcpyHostToDevice, decoder->stream);
310+
cudaMemcpyAsync(coder->d_segment, coder->segment, decoder->segment_count * sizeof(struct gpujpeg_segment),
311+
cudaMemcpyHostToDevice, coder->stream);
310312
gpujpeg_cuda_check_error("Decoder copy compressed data", return -1);
311313

312-
GPUJPEG_CUSTOM_TIMER_STOP(coder->duration_memory_to, coder->param.perf_stats, decoder->stream, return -1);
314+
GPUJPEG_CUSTOM_TIMER_STOP(coder->duration_memory_to, coder->param.perf_stats, coder->stream, return -1);
313315

314-
GPUJPEG_CUSTOM_TIMER_START(coder->duration_in_gpu, coder->param.perf_stats, decoder->stream, return -1);
315-
GPUJPEG_CUSTOM_TIMER_START(coder->duration_huffman_coder, coder->param.perf_stats, decoder->stream, return -1);
316+
GPUJPEG_CUSTOM_TIMER_START(coder->duration_in_gpu, coder->param.perf_stats, coder->stream, return -1);
317+
GPUJPEG_CUSTOM_TIMER_START(coder->duration_huffman_coder, coder->param.perf_stats, coder->stream, return -1);
316318
// Perform huffman decoding
317319
if (0 != gpujpeg_huffman_gpu_decoder_decode(decoder)) {
318320
fprintf(stderr, "[GPUJPEG] [Error] Huffman decoder on GPU failed!\n");
319321
return -1;
320322
}
321-
GPUJPEG_CUSTOM_TIMER_STOP(coder->duration_huffman_coder, coder->param.perf_stats, decoder->stream, return -1);
323+
GPUJPEG_CUSTOM_TIMER_STOP(coder->duration_huffman_coder, coder->param.perf_stats, coder->stream, return -1);
322324
}
323325

324-
GPUJPEG_CUSTOM_TIMER_START(coder->duration_dct_quantization, coder->param.perf_stats, decoder->stream, return -1);
326+
GPUJPEG_CUSTOM_TIMER_START(coder->duration_dct_quantization, coder->param.perf_stats, coder->stream, return -1);
325327

326328
// Perform IDCT and dequantization (own CUDA implementation)
327329
if (0 != gpujpeg_idct_gpu(decoder)) {
328330
return -1;
329331
}
330332
// gpujpeg_idct_cpu(decoder);
331333

332-
GPUJPEG_CUSTOM_TIMER_STOP(coder->duration_dct_quantization, coder->param.perf_stats, decoder->stream, return -1);
334+
GPUJPEG_CUSTOM_TIMER_STOP(coder->duration_dct_quantization, coder->param.perf_stats, coder->stream, return -1);
333335

334336
// Reallocate buffers if not enough size
335337
if ( coder->data_raw_size > coder->data_raw_allocated_size ) {
@@ -356,33 +358,33 @@ gpujpeg_decoder_decode(struct gpujpeg_decoder* decoder, uint8_t* image, size_t i
356358
coder->d_data_raw = output->data;
357359
}
358360
else if (output->type == GPUJPEG_DECODER_OUTPUT_OPENGL_TEXTURE && output->texture->texture_callback_attach_opengl == NULL) {
359-
GPUJPEG_CUSTOM_TIMER_START(coder->duration_memory_map, coder->param.perf_stats, decoder->stream, return -1);
361+
GPUJPEG_CUSTOM_TIMER_START(coder->duration_memory_map, coder->param.perf_stats, coder->stream, return -1);
360362

361363
// Use OpenGL texture as decoding destination
362364
size_t data_size = 0;
363365
uint8_t* d_data = gpujpeg_opengl_texture_map(output->texture, &data_size);
364366
assert(data_size == coder->data_raw_size);
365367
coder->d_data_raw = d_data;
366368

367-
GPUJPEG_CUSTOM_TIMER_STOP(coder->duration_memory_map, coder->param.perf_stats, decoder->stream, return -1);
369+
GPUJPEG_CUSTOM_TIMER_STOP(coder->duration_memory_map, coder->param.perf_stats, coder->stream, return -1);
368370
}
369371
else {
370372
// Use internal CUDA buffer as decoding destination
371373
coder->d_data_raw = coder->d_data_raw_allocated;
372374
}
373375

374376
// Postprocessing
375-
GPUJPEG_CUSTOM_TIMER_START(coder->duration_preprocessor, coder->param.perf_stats, decoder->stream, return -1);
376-
rc = gpujpeg_postprocessor_decode(&decoder->coder, decoder->stream);
377+
GPUJPEG_CUSTOM_TIMER_START(coder->duration_preprocessor, coder->param.perf_stats, coder->stream, return -1);
378+
rc = gpujpeg_postprocessor_decode(&decoder->coder, coder->stream);
377379
if (rc != GPUJPEG_NOERR) {
378380
return rc;
379381
}
380-
GPUJPEG_CUSTOM_TIMER_STOP(coder->duration_preprocessor, coder->param.perf_stats, decoder->stream, return -1);
382+
GPUJPEG_CUSTOM_TIMER_STOP(coder->duration_preprocessor, coder->param.perf_stats, coder->stream, return -1);
381383

382384
// Wait for async operations before copying from the device
383-
cudaStreamSynchronize(decoder->stream);
385+
cudaStreamSynchronize(coder->stream);
384386

385-
GPUJPEG_CUSTOM_TIMER_STOP(coder->duration_in_gpu, coder->param.perf_stats, decoder->stream, return -1);
387+
GPUJPEG_CUSTOM_TIMER_STOP(coder->duration_in_gpu, coder->param.perf_stats, coder->stream, return -1);
386388

387389
// Set decompressed image size
388390
output->data_size = coder->data_raw_size * sizeof(uint8_t);
@@ -393,52 +395,52 @@ gpujpeg_decoder_decode(struct gpujpeg_decoder* decoder, uint8_t* image, size_t i
393395

394396
// Set decompressed image
395397
if (output->type == GPUJPEG_DECODER_OUTPUT_INTERNAL_BUFFER) {
396-
GPUJPEG_CUSTOM_TIMER_START(coder->duration_memory_from, coder->param.perf_stats, decoder->stream, return -1);
398+
GPUJPEG_CUSTOM_TIMER_START(coder->duration_memory_from, coder->param.perf_stats, coder->stream, return -1);
397399

398400
// Copy decompressed image to host memory
399401
cudaMemcpy(coder->data_raw, coder->d_data_raw, coder->data_raw_size * sizeof(uint8_t), cudaMemcpyDeviceToHost);
400402

401-
GPUJPEG_CUSTOM_TIMER_STOP(coder->duration_memory_from, coder->param.perf_stats, decoder->stream, return -1);
403+
GPUJPEG_CUSTOM_TIMER_STOP(coder->duration_memory_from, coder->param.perf_stats, coder->stream, return -1);
402404

403405
// Set output to internal buffer
404406
output->data = coder->data_raw;
405407
}
406408
else if (output->type == GPUJPEG_DECODER_OUTPUT_CUSTOM_BUFFER) {
407-
GPUJPEG_CUSTOM_TIMER_START(coder->duration_memory_from, coder->param.perf_stats, decoder->stream, return -1);
409+
GPUJPEG_CUSTOM_TIMER_START(coder->duration_memory_from, coder->param.perf_stats, coder->stream, return -1);
408410

409411
assert(output->data != NULL);
410412

411413
// Copy decompressed image to host memory
412414
cudaMemcpy(output->data, coder->d_data_raw, coder->data_raw_size * sizeof(uint8_t), cudaMemcpyDeviceToHost);
413415

414-
GPUJPEG_CUSTOM_TIMER_STOP(coder->duration_memory_from, coder->param.perf_stats, decoder->stream, return -1);
416+
GPUJPEG_CUSTOM_TIMER_STOP(coder->duration_memory_from, coder->param.perf_stats, coder->stream, return -1);
415417
}
416418
else if (output->type == GPUJPEG_DECODER_OUTPUT_OPENGL_TEXTURE) {
417419
// If OpenGL texture wasn't mapped and used directly for decoding into it
418420
if (output->texture->texture_callback_attach_opengl != NULL) {
419-
GPUJPEG_CUSTOM_TIMER_START(coder->duration_memory_map, coder->param.perf_stats, decoder->stream, return -1);
421+
GPUJPEG_CUSTOM_TIMER_START(coder->duration_memory_map, coder->param.perf_stats, coder->stream, return -1);
420422

421423
// Map OpenGL texture
422424
size_t data_size = 0;
423425
uint8_t* d_data = gpujpeg_opengl_texture_map(output->texture, &data_size);
424426
assert(data_size == coder->data_raw_size);
425427

426-
GPUJPEG_CUSTOM_TIMER_STOP(coder->duration_memory_map, coder->param.perf_stats, decoder->stream, return -1);
428+
GPUJPEG_CUSTOM_TIMER_STOP(coder->duration_memory_map, coder->param.perf_stats, coder->stream, return -1);
427429

428-
GPUJPEG_CUSTOM_TIMER_START(coder->duration_memory_from, coder->param.perf_stats, decoder->stream, return -1);
430+
GPUJPEG_CUSTOM_TIMER_START(coder->duration_memory_from, coder->param.perf_stats, coder->stream, return -1);
429431

430432
// Copy decompressed image to texture pixel buffer object device data
431433
cudaMemcpy(d_data, coder->d_data_raw, coder->data_raw_size * sizeof(uint8_t), cudaMemcpyDeviceToDevice);
432434

433-
GPUJPEG_CUSTOM_TIMER_STOP(coder->duration_memory_from, coder->param.perf_stats, decoder->stream, return -1);
435+
GPUJPEG_CUSTOM_TIMER_STOP(coder->duration_memory_from, coder->param.perf_stats, coder->stream, return -1);
434436
}
435437

436-
GPUJPEG_CUSTOM_TIMER_START(coder->duration_memory_unmap, coder->param.perf_stats, decoder->stream, return -1);
438+
GPUJPEG_CUSTOM_TIMER_START(coder->duration_memory_unmap, coder->param.perf_stats, coder->stream, return -1);
437439

438440
// Unmap OpenGL texture
439441
gpujpeg_opengl_texture_unmap(output->texture);
440442

441-
GPUJPEG_CUSTOM_TIMER_STOP(coder->duration_memory_unmap, coder->param.perf_stats, decoder->stream, return -1);
443+
GPUJPEG_CUSTOM_TIMER_STOP(coder->duration_memory_unmap, coder->param.perf_stats, coder->stream, return -1);
442444
}
443445
else if (output->type == GPUJPEG_DECODER_OUTPUT_CUDA_BUFFER) {
444446
// Copy decompressed image to texture pixel buffer object device data

src/gpujpeg_decoder_internal.h

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/**
22
* @file
3-
* Copyright (c) 2011-2023, CESNET z.s.p.o
3+
* Copyright (c) 2011-2025, CESNET
44
* Copyright (c) 2011, Silicon Genome, LLC.
55
*
66
* All rights reserved.
@@ -65,9 +65,6 @@ struct gpujpeg_decoder
6565
/// Current data compressed size for decoded image
6666
size_t data_compressed_size;
6767

68-
// Stream
69-
cudaStream_t stream;
70-
7168
enum gpujpeg_pixel_format req_pixel_format;
7269
enum gpujpeg_color_space req_color_space;
7370
bool ff_cs_itu601_is_709; ///< if FFmpeg specific COM marker "CS=ITU601" present, interpret the data as

0 commit comments

Comments
 (0)