@@ -226,36 +226,37 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
226226
227227 cudaStream_t rawStream = at::cuda::getCurrentCUDAStream ().stream ();
228228
229- // Build an NppStreamContext, either via the old helper or by hand on CUDA 12.9+
229+ // Build an NppStreamContext, either via the old helper or by hand on
230+ // CUDA 12.9+
230231 NppStreamContext nppCtx{};
231- #if CUDA_VERSION < 12090
232- NppStatus ctxStat = nppGetStreamContext (&nppCtx);
233- TORCH_CHECK (ctxStat == NPP_SUCCESS, " nppGetStreamContext failed" );
234- // override if you want to force a particular stream
235- nppCtx.hStream = rawStream;
236- #else
237- // CUDA 12.9+: helper was removed, we need to build it manually
238- int dev = 0 ;
239- cudaError_t err = cudaGetDevice (&dev);
240- TORCH_CHECK (err == cudaSuccess, " cudaGetDevice failed" );
241- cudaDeviceProp prop{};
242- err = cudaGetDeviceProperties (&prop, dev);
243- TORCH_CHECK (err == cudaSuccess, " cudaGetDeviceProperties failed" );
244-
245- nppCtx.nCudaDeviceId = dev;
246- nppCtx.nMultiProcessorCount = prop.multiProcessorCount ;
247- nppCtx.nMaxThreadsPerMultiProcessor = prop.maxThreadsPerMultiProcessor ;
248- nppCtx.nMaxThreadsPerBlock = prop.maxThreadsPerBlock ;
249- nppCtx.nSharedMemPerBlock = prop.sharedMemPerBlock ;
250- nppCtx.nCudaDevAttrComputeCapabilityMajor = prop.major ;
251- nppCtx.nCudaDevAttrComputeCapabilityMinor = prop.minor ;
252- nppCtx.nStreamFlags = 0 ;
253- nppCtx.hStream = rawStream;
254- #endif
232+ #if CUDA_VERSION < 12090
233+ NppStatus ctxStat = nppGetStreamContext (&nppCtx);
234+ TORCH_CHECK (ctxStat == NPP_SUCCESS, " nppGetStreamContext failed" );
235+ // override if you want to force a particular stream
236+ nppCtx.hStream = rawStream;
237+ #else
238+ // CUDA 12.9+: helper was removed, we need to build it manually
239+ int dev = 0 ;
240+ cudaError_t err = cudaGetDevice (&dev);
241+ TORCH_CHECK (err == cudaSuccess, " cudaGetDevice failed" );
242+ cudaDeviceProp prop{};
243+ err = cudaGetDeviceProperties (&prop, dev);
244+ TORCH_CHECK (err == cudaSuccess, " cudaGetDeviceProperties failed" );
245+
246+ nppCtx.nCudaDeviceId = dev;
247+ nppCtx.nMultiProcessorCount = prop.multiProcessorCount ;
248+ nppCtx.nMaxThreadsPerMultiProcessor = prop.maxThreadsPerMultiProcessor ;
249+ nppCtx.nMaxThreadsPerBlock = prop.maxThreadsPerBlock ;
250+ nppCtx.nSharedMemPerBlock = prop.sharedMemPerBlock ;
251+ nppCtx.nCudaDevAttrComputeCapabilityMajor = prop.major ;
252+ nppCtx.nCudaDevAttrComputeCapabilityMinor = prop.minor ;
253+ nppCtx.nStreamFlags = 0 ;
254+ nppCtx.hStream = rawStream;
255+ #endif
255256
256257 // Prepare ROI + pointers
257- NppiSize oSizeROI = { width, height };
258- Npp8u* input[2 ] = { avFrame->data [0 ], avFrame->data [1 ] };
258+ NppiSize oSizeROI = {width, height};
259+ Npp8u* input[2 ] = {avFrame->data [0 ], avFrame->data [1 ]};
259260
260261 auto start = std::chrono::high_resolution_clock::now ();
261262 NppStatus status;
@@ -281,10 +282,8 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
281282
282283 auto end = std::chrono::high_resolution_clock::now ();
283284 auto duration = std::chrono::duration<double , std::micro>(end - start);
284- VLOG (9 ) << " NPP Conversion of frame h=" << height
285- << " w=" << width
285+ VLOG (9 ) << " NPP Conversion of frame h=" << height << " w=" << width
286286 << " took: " << duration.count () << " us" ;
287-
288287}
289288
290289// inspired by https://github.com/FFmpeg/FFmpeg/commit/ad67ea9
0 commit comments