
Commit ee7e66b

NicolasHug and w-m authored and committed
[fbsync] Encode jpeg cuda sync (#8929)
Reviewed By: scotts

Differential Revision: D77997051

fbshipit-source-id: 3b0fc22020c8930219f866ccd64679c5095962e4

Co-authored-by: Wieland Morgenstern <[email protected]>
Co-authored-by: Nicolas Hug <[email protected]>
Co-authored-by: Nicolas Hug <[email protected]>
1 parent f8f26dd commit ee7e66b
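
The commit moves the CUDA event so that it is recorded only after all encode work has been enqueued on the encoder's dedicated stream, and makes the caller's stream wait on that event; before this change the event was recorded before the loop, so waiting on it gave no guarantee that the encode kernels had finished. Below is a minimal Python sketch of that ordering rule using plain torch.cuda streams and events. It is illustrative only and is not the torchvision implementation (the actual change is the C++ diff further down); `encode_stream`, `caller_stream` and `done` are placeholder names.

```python
import torch

# Minimal sketch of the event ordering enforced by this commit (assumes a CUDA device;
# not torchvision code -- the real change is in encode_jpegs_cuda.cpp below).
assert torch.cuda.is_available()

caller_stream = torch.cuda.current_stream()  # analogue of CUDAJpegEncoder::current_stream
encode_stream = torch.cuda.Stream()          # analogue of the encoder's dedicated stream

done = torch.cuda.Event()
with torch.cuda.stream(encode_stream):
    x = torch.rand(3, 1024, 1024, device="cuda")
    y = (x * 255).to(torch.uint8)  # stand-in for the per-image encode loop

done.record(encode_stream)  # record *after* the work has been enqueued (the fix)
done.wait(caller_stream)    # future work on the caller's stream now orders after the encode

print(y.float().mean().item())  # safe to consume on the caller's stream, no host-side sync needed
```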

File tree

3 files changed: +53 −10

test/test_image.py
torchvision/csrc/io/image/cuda/encode_jpegs_cuda.cpp
torchvision/csrc/io/image/cuda/encode_jpegs_cuda.h


test/test_image.py

Lines changed: 36 additions & 0 deletions
@@ -623,6 +623,42 @@ def test_encode_jpeg_cuda(img_path, scripted, contiguous):
     assert abs_mean_diff < 3
 
 
+@needs_cuda
+def test_encode_jpeg_cuda_sync():
+    """
+    Non-regression test for https://github.com/pytorch/vision/issues/8587.
+    Attempts to reproduce an intermittent CUDA stream synchronization bug
+    by randomly creating images and round-tripping them via encode_jpeg
+    and decode_jpeg on the GPU. Fails if the mean difference in uint8 range
+    exceeds 5.
+    """
+    torch.manual_seed(42)
+
+    # manual testing shows this bug appearing often in iterations between 50 and 100
+    # as a synchronization bug, this can't be reliably reproduced
+    max_iterations = 100
+    threshold = 5.0  # in [0..255]
+
+    device = torch.device("cuda")
+
+    for iteration in range(max_iterations):
+        height, width = torch.randint(4000, 5000, size=(2,))
+
+        image = torch.linspace(0, 1, steps=height * width, device=device)
+        image = image.view(1, height, width).expand(3, -1, -1)
+
+        image = (image * 255).clamp(0, 255).to(torch.uint8)
+        jpeg_bytes = encode_jpeg(image, quality=100)
+
+        decoded_image = decode_jpeg(jpeg_bytes.cpu(), device=device)
+        mean_difference = (image.float() - decoded_image.float()).abs().mean().item()
+
+        assert mean_difference <= threshold, (
+            f"Encode/decode mismatch at iteration={iteration}, "
+            f"size={height}x{width}, mean diff={mean_difference:.2f}"
+        )
+
+
 @pytest.mark.parametrize("device", cpu_and_cuda())
 @pytest.mark.parametrize("scripted", (True, False))
 @pytest.mark.parametrize("contiguous", (True, False))
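
For context on the thresholds: the pre-existing CUDA test above asserts a mean absolute difference below 3 for a quality-controlled round trip, and the new test fails at 5, so anything beyond that points to corrupted pixel data rather than ordinary JPEG loss. The snippet below is a small CPU-only illustration of the same metric (exact values depend on the image and the libjpeg build, but a quality=100 round trip of a smooth gradient stays well under these thresholds):

```python
import torch
from torchvision.io import decode_jpeg, encode_jpeg

# Mean absolute difference of a quality=100 JPEG round trip on a smooth gradient image,
# as a baseline for the thresholds used in the tests above.
image = torch.linspace(0, 1, steps=256 * 256).view(1, 256, 256).expand(3, -1, -1)
image = (image * 255).clamp(0, 255).to(torch.uint8).contiguous()

round_tripped = decode_jpeg(encode_jpeg(image, quality=100))
print((image.float() - round_tripped.float()).abs().mean().item())  # small, well under 5
```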

torchvision/csrc/io/image/cuda/encode_jpegs_cuda.cpp

Lines changed: 16 additions & 10 deletions
@@ -92,12 +92,12 @@ std::vector<torch::Tensor> encode_jpegs_cuda(
 
   cudaJpegEncoder->set_quality(quality);
   std::vector<torch::Tensor> encoded_images;
-  at::cuda::CUDAEvent event;
-  event.record(cudaJpegEncoder->stream);
   for (const auto& image : contig_images) {
     auto encoded_image = cudaJpegEncoder->encode_jpeg(image);
     encoded_images.push_back(encoded_image);
   }
+  at::cuda::CUDAEvent event;
+  event.record(cudaJpegEncoder->stream);
 
   // We use a dedicated stream to do the encoding and even though the results
   // may be ready on that stream we cannot assume that they are also available
@@ -106,10 +106,7 @@
   // do not want to block the host at this particular point
   // (which is what cudaStreamSynchronize would do.) Events allow us to
   // synchronize the streams without blocking the host.
-  event.block(at::cuda::getCurrentCUDAStream(
-      cudaJpegEncoder->original_device.has_index()
-          ? cudaJpegEncoder->original_device.index()
-          : 0));
+  event.block(cudaJpegEncoder->current_stream);
   return encoded_images;
 }
 
@@ -119,7 +116,11 @@ CUDAJpegEncoder::CUDAJpegEncoder(const torch::Device& target_device)
       stream{
           target_device.has_index()
               ? at::cuda::getStreamFromPool(false, target_device.index())
-              : at::cuda::getStreamFromPool(false)} {
+              : at::cuda::getStreamFromPool(false)},
+      current_stream{
+          original_device.has_index()
+              ? at::cuda::getCurrentCUDAStream(original_device.index())
+              : at::cuda::getCurrentCUDAStream()} {
   nvjpegStatus_t status;
   status = nvjpegCreateSimple(&nvjpeg_handle);
   TORCH_CHECK(
@@ -184,12 +185,17 @@ CUDAJpegEncoder::~CUDAJpegEncoder() {
 }
 
 torch::Tensor CUDAJpegEncoder::encode_jpeg(const torch::Tensor& src_image) {
+  nvjpegStatus_t status;
+  cudaError_t cudaStatus;
+
+  // Ensure that the incoming src_image is safe to use
+  cudaStatus = cudaStreamSynchronize(current_stream);
+  TORCH_CHECK(cudaStatus == cudaSuccess, "CUDA ERROR: ", cudaStatus);
+
   int channels = src_image.size(0);
   int height = src_image.size(1);
   int width = src_image.size(2);
 
-  nvjpegStatus_t status;
-  cudaError_t cudaStatus;
   status = nvjpegEncoderParamsSetSamplingFactors(
       nv_enc_params, NVJPEG_CSS_444, stream);
   TORCH_CHECK(
@@ -249,7 +255,7 @@ torch::Tensor CUDAJpegEncoder::encode_jpeg(const torch::Tensor& src_image) {
       nv_enc_state,
       encoded_image.data_ptr<uint8_t>(),
       &length,
-      0);
+      stream);
   TORCH_CHECK(
       status == NVJPEG_STATUS_SUCCESS,
       "Failed to retrieve encoded image: ",

torchvision/csrc/io/image/cuda/encode_jpegs_cuda.h

Lines changed: 1 addition & 0 deletions
@@ -22,6 +22,7 @@ class CUDAJpegEncoder {
   const torch::Device original_device;
   const torch::Device target_device;
   const c10::cuda::CUDAStream stream;
+  const c10::cuda::CUDAStream current_stream;
 
  protected:
   nvjpegEncoderState_t nv_enc_state;
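
Taken together, the .h and .cpp changes cache the caller's current stream at construction and handle both directions of the cross-stream handoff: `encode_jpeg` now calls `cudaStreamSynchronize(current_stream)` before reading `src_image`, so the caller's stream has finished producing the input before the dedicated encode stream consumes it, and the event recorded after the encode loop makes the caller's stream wait for the results. The Python sketch below expresses the same two orderings stream-to-stream with plain torch.cuda (illustrative only, not the nvJPEG code above; `producer` and `encoder` are placeholder names); unlike the host-blocking `cudaStreamSynchronize` used for the input side in the diff, `wait_stream` orders the streams without involving the host.

```python
import torch

# Illustrative two-stream handoff (assumes a CUDA device; not the nvJPEG code above).
assert torch.cuda.is_available()

producer = torch.cuda.current_stream()  # stream on which the caller builds src_image
encoder = torch.cuda.Stream()           # analogue of CUDAJpegEncoder::stream

src_image = torch.rand(3, 4096, 4096, device="cuda")  # enqueued on `producer`

# Input side: do not read src_image until the producer stream has finished writing it.
encoder.wait_stream(producer)
with torch.cuda.stream(encoder):
    checksum = src_image.float().sum()  # stand-in for reading src_image during encoding

# Output side: the caller's stream must in turn wait for the encoder's work.
producer.wait_stream(encoder)
print(checksum.item())
```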
