diff --git a/Cargo.lock b/Cargo.lock index 4e294dca50..97bceb9bfa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1594,6 +1594,7 @@ dependencies = [ "metal 0.31.0", "objc2 0.6.2", "pretty_assertions", + "rayon", "reactive_graph", "resvg", "serde", @@ -1663,6 +1664,7 @@ dependencies = [ "cidre", "ffmpeg-hw-device", "ffmpeg-next", + "num_cpus", "tokio", "tracing", "windows 0.60.0", diff --git a/crates/frame-converter/src/d3d11.rs b/crates/frame-converter/src/d3d11.rs index 694c4868a3..60eaaefd45 100644 --- a/crates/frame-converter/src/d3d11.rs +++ b/crates/frame-converter/src/d3d11.rs @@ -113,15 +113,15 @@ pub struct D3D11Converter { fn get_gpu_info(device: &ID3D11Device) -> Result { unsafe { let dxgi_device: IDXGIDevice = device.cast().map_err(|e| { - ConvertError::HardwareUnavailable(format!("Failed to get DXGI device: {:?}", e)) + ConvertError::HardwareUnavailable(format!("Failed to get DXGI device: {e:?}")) })?; let adapter: IDXGIAdapter = dxgi_device.GetAdapter().map_err(|e| { - ConvertError::HardwareUnavailable(format!("Failed to get adapter: {:?}", e)) + ConvertError::HardwareUnavailable(format!("Failed to get adapter: {e:?}")) })?; let desc = adapter.GetDesc().map_err(|e| { - ConvertError::HardwareUnavailable(format!("Failed to get adapter description: {:?}", e)) + ConvertError::HardwareUnavailable(format!("Failed to get adapter description: {e:?}")) })?; let description = String::from_utf16_lossy( @@ -165,8 +165,7 @@ impl D3D11Converter { ) .map_err(|e| { ConvertError::HardwareUnavailable(format!( - "D3D11CreateDevice failed (no hardware GPU available?): {:?}", - e + "D3D11CreateDevice failed (no hardware GPU available?): {e:?}" )) })?; @@ -193,13 +192,12 @@ impl D3D11Converter { let video_device: ID3D11VideoDevice = device.cast().map_err(|e| { ConvertError::HardwareUnavailable(format!( - "GPU does not support D3D11 Video API (ID3D11VideoDevice): {:?}", - e + "GPU does not support D3D11 Video API (ID3D11VideoDevice): {e:?}" )) })?; let video_context: ID3D11VideoContext = context.cast().map_err(|e| { - ConvertError::HardwareUnavailable(format!("Failed to get ID3D11VideoContext: {:?}", e)) + ConvertError::HardwareUnavailable(format!("Failed to get ID3D11VideoContext: {e:?}")) })?; let content_desc = D3D11_VIDEO_PROCESSOR_CONTENT_DESC { @@ -225,8 +223,8 @@ impl D3D11Converter { .CreateVideoProcessorEnumerator(&content_desc) .map_err(|e| { ConvertError::HardwareUnavailable(format!( - "CreateVideoProcessorEnumerator failed (format {:?}->{:?} not supported by GPU?): {:?}", - config.input_format, config.output_format, e + "CreateVideoProcessorEnumerator failed (format {:?}->{:?} not supported by GPU?): {e:?}", + config.input_format, config.output_format )) })? }; @@ -235,10 +233,7 @@ impl D3D11Converter { video_device .CreateVideoProcessor(&enumerator, 0) .map_err(|e| { - ConvertError::HardwareUnavailable(format!( - "CreateVideoProcessor failed: {:?}", - e - )) + ConvertError::HardwareUnavailable(format!("CreateVideoProcessor failed: {e:?}")) })? }; @@ -351,9 +346,7 @@ impl FrameConverter for D3D11Converter { 0, Some(&mut mapped), ) - .map_err(|e| { - ConvertError::ConversionFailed(format!("Map input failed: {:?}", e)) - })?; + .map_err(|e| ConvertError::ConversionFailed(format!("Map input failed: {e:?}")))?; copy_frame_to_mapped(&input, mapped.pData as *mut u8, mapped.RowPitch as usize); @@ -385,7 +378,7 @@ impl FrameConverter for D3D11Converter { Some(&mut input_view), ) .map_err(|e| { - ConvertError::ConversionFailed(format!("CreateInputView failed: {:?}", e)) + ConvertError::ConversionFailed(format!("CreateInputView failed: {e:?}")) })?; let input_view = input_view.ok_or_else(|| { ConvertError::ConversionFailed("CreateInputView returned null".to_string()) @@ -411,7 +404,7 @@ impl FrameConverter for D3D11Converter { Some(&mut output_view), ) .map_err(|e| { - ConvertError::ConversionFailed(format!("CreateOutputView failed: {:?}", e)) + ConvertError::ConversionFailed(format!("CreateOutputView failed: {e:?}")) })?; let output_view = output_view.ok_or_else(|| { ConvertError::ConversionFailed("CreateOutputView returned null".to_string()) @@ -435,7 +428,7 @@ impl FrameConverter for D3D11Converter { .video_context .VideoProcessorBlt(&resources.processor, &output_view, 0, &[stream]) .map_err(|e| { - ConvertError::ConversionFailed(format!("VideoProcessorBlt failed: {:?}", e)) + ConvertError::ConversionFailed(format!("VideoProcessorBlt failed: {e:?}")) })?; if !self.verified_gpu_usage.swap(true, Ordering::Relaxed) { @@ -459,9 +452,7 @@ impl FrameConverter for D3D11Converter { 0, Some(&mut mapped), ) - .map_err(|e| { - ConvertError::ConversionFailed(format!("Map output failed: {:?}", e)) - })?; + .map_err(|e| ConvertError::ConversionFailed(format!("Map output failed: {e:?}")))?; let mut output = frame::Video::new(self.output_format, self.output_width, self.output_height); @@ -533,7 +524,7 @@ fn create_texture( device .CreateTexture2D(&desc, None, Some(&mut texture)) .map_err(|e| { - ConvertError::HardwareUnavailable(format!("CreateTexture2D failed: {:?}", e)) + ConvertError::HardwareUnavailable(format!("CreateTexture2D failed: {e:?}")) })?; texture.ok_or_else(|| { ConvertError::HardwareUnavailable("CreateTexture2D returned null".to_string()) diff --git a/crates/recording/src/output_pipeline/win.rs b/crates/recording/src/output_pipeline/win.rs index 9a801344ef..a9c5a756ec 100644 --- a/crates/recording/src/output_pipeline/win.rs +++ b/crates/recording/src/output_pipeline/win.rs @@ -212,8 +212,7 @@ impl Muxer for WindowsMuxer { Ok(guard) => guard, Err(poisoned) => { return fallback(Some(format!( - "Failed to lock output mutex: {}", - poisoned + "Failed to lock output mutex: {poisoned}" ))); } }; @@ -541,7 +540,7 @@ impl Muxer for WindowsCameraMuxer { let mut output_guard = match output.lock() { Ok(guard) => guard, Err(poisoned) => { - let msg = format!("Failed to lock output mutex: {}", poisoned); + let msg = format!("Failed to lock output mutex: {poisoned}"); let _ = ready_tx.send(Err(anyhow!("{}", msg))); return Err(anyhow!("{}", msg)); } diff --git a/crates/rendering/Cargo.toml b/crates/rendering/Cargo.toml index 6735427df8..5ef892c5ab 100644 --- a/crates/rendering/Cargo.toml +++ b/crates/rendering/Cargo.toml @@ -18,6 +18,7 @@ tokio.workspace = true ffmpeg.workspace = true futures = { workspace = true } futures-intrusive = "0.5.0" +rayon = "1.10" image = "0.25.2" log = "0.4" serde = { workspace = true } diff --git a/crates/rendering/src/cpu_yuv.rs b/crates/rendering/src/cpu_yuv.rs index df278ce8d8..7dbb0a26bb 100644 --- a/crates/rendering/src/cpu_yuv.rs +++ b/crates/rendering/src/cpu_yuv.rs @@ -1,3 +1,37 @@ +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; + +pub struct ConversionProgress { + pub rows_completed: AtomicUsize, + pub total_rows: usize, + pub cancelled: AtomicBool, +} + +impl ConversionProgress { + pub fn new(total_rows: usize) -> Self { + Self { + rows_completed: AtomicUsize::new(0), + total_rows, + cancelled: AtomicBool::new(false), + } + } + + pub fn progress_fraction(&self) -> f32 { + if self.total_rows == 0 { + return 1.0; + } + self.rows_completed.load(Ordering::Relaxed) as f32 / self.total_rows as f32 + } + + pub fn cancel(&self) { + self.cancelled.store(true, Ordering::Relaxed); + } + + pub fn is_cancelled(&self) -> bool { + self.cancelled.load(Ordering::Relaxed) + } +} + pub fn nv12_to_rgba( y_data: &[u8], uv_data: &[u8], @@ -92,6 +126,44 @@ pub fn yuv420p_to_rgba( } } +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum SimdLevel { + Scalar, + Sse2, + Avx2, +} + +impl SimdLevel { + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + pub fn detect() -> Self { + if is_x86_feature_detected!("avx2") { + SimdLevel::Avx2 + } else if is_x86_feature_detected!("sse2") { + SimdLevel::Sse2 + } else { + SimdLevel::Scalar + } + } + + #[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))] + pub fn detect() -> Self { + SimdLevel::Scalar + } + + pub fn pixels_per_iteration(self) -> usize { + match self { + SimdLevel::Avx2 => 16, + SimdLevel::Sse2 => 8, + SimdLevel::Scalar => 1, + } + } +} + +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +const PARALLEL_THRESHOLD_PIXELS: usize = 1920 * 1080; +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +const MIN_ROWS_PER_THREAD: usize = 16; + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] pub fn nv12_to_rgba_simd( y_data: &[u8], @@ -102,15 +174,23 @@ pub fn nv12_to_rgba_simd( uv_stride: u32, output: &mut [u8], ) { - #[cfg(target_arch = "x86")] - use std::arch::x86::*; - #[cfg(target_arch = "x86_64")] - use std::arch::x86_64::*; - - if !is_x86_feature_detected!("sse2") { - return nv12_to_rgba(y_data, uv_data, width, height, y_stride, uv_stride, output); - } + nv12_to_rgba_simd_with_progress( + y_data, uv_data, width, height, y_stride, uv_stride, output, None, + ); +} +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +#[allow(clippy::too_many_arguments)] +pub fn nv12_to_rgba_simd_with_progress( + y_data: &[u8], + uv_data: &[u8], + width: u32, + height: u32, + y_stride: u32, + uv_stride: u32, + output: &mut [u8], + progress: Option>, +) { let width_usize = width as usize; let height_usize = height as usize; let y_stride_usize = y_stride as usize; @@ -143,127 +223,273 @@ pub fn nv12_to_rgba_simd( return nv12_to_rgba(y_data, uv_data, width, height, y_stride, uv_stride, output); } - debug_assert!( - y_stride_usize >= width_usize, - "Y stride ({y_stride_usize}) must be >= width ({width_usize})" - ); - debug_assert!( - uv_stride_usize >= uv_width_bytes, - "UV stride ({uv_stride_usize}) must be >= UV width bytes ({uv_width_bytes})" - ); - debug_assert!( - y_data.len() >= y_required, - "Y buffer too small: {} < {y_required}", - y_data.len() - ); - debug_assert!( - uv_data.len() >= uv_required, - "UV buffer too small: {} < {uv_required}", - uv_data.len() - ); - debug_assert!( - output.len() >= output_required, - "Output buffer too small: {} < {output_required}", - output.len() - ); - - let simd_width = (width_usize / 8) * 8; - - unsafe { - let c16 = _mm_set1_epi16(16); - let c128 = _mm_set1_epi16(128); - let c298 = _mm_set1_epi16(298); - let c409 = _mm_set1_epi16(409); - let c100 = _mm_set1_epi16(100); - let c208 = _mm_set1_epi16(208); - let c516 = _mm_set1_epi16(516); - let zero = _mm_setzero_si128(); - - for row in 0..height_usize { - let y_row_start = row * y_stride_usize; - let uv_row_start = (row / 2) * uv_stride_usize; - let out_row_start = row * width_usize * 4; - - let mut col = 0usize; - - while col + 8 <= simd_width { - let y_ptr = y_data.as_ptr().add(y_row_start + col); - let uv_ptr = uv_data.as_ptr().add(uv_row_start + (col / 2) * 2); - - let y8 = _mm_loadl_epi64(y_ptr as *const __m128i); - let y16 = _mm_unpacklo_epi8(y8, zero); - let y_adj = _mm_sub_epi16(y16, c16); - - let uv8 = _mm_loadl_epi64(uv_ptr as *const __m128i); - - let u8_val = _mm_and_si128(uv8, _mm_set1_epi16(0x00FF)); - let v8_val = _mm_srli_epi16(uv8, 8); - - let u_dup = _mm_unpacklo_epi16(u8_val, u8_val); - let v_dup = _mm_unpacklo_epi16(v8_val, v8_val); - - let u16 = _mm_unpacklo_epi8(u_dup, zero); - let v16 = _mm_unpacklo_epi8(v_dup, zero); - - let d = _mm_sub_epi16(u16, c128); - let e = _mm_sub_epi16(v16, c128); - - let c_scaled = _mm_mullo_epi16(y_adj, c298); - - let r_raw = _mm_add_epi16(c_scaled, _mm_mullo_epi16(e, c409)); - let r_raw = _mm_add_epi16(r_raw, c128); - let r_raw = _mm_srai_epi16(r_raw, 8); + let simd_level = SimdLevel::detect(); + let total_pixels = width_usize * height_usize; + let use_parallel = total_pixels >= PARALLEL_THRESHOLD_PIXELS; + + if use_parallel { + nv12_convert_parallel( + y_data, + uv_data, + width_usize, + height_usize, + y_stride_usize, + uv_stride_usize, + output, + simd_level, + progress, + ); + } else { + nv12_convert_sequential( + y_data, + uv_data, + width_usize, + height_usize, + y_stride_usize, + uv_stride_usize, + output, + simd_level, + progress, + ); + } +} - let g_raw = _mm_sub_epi16(c_scaled, _mm_mullo_epi16(d, c100)); - let g_raw = _mm_sub_epi16(g_raw, _mm_mullo_epi16(e, c208)); - let g_raw = _mm_add_epi16(g_raw, c128); - let g_raw = _mm_srai_epi16(g_raw, 8); +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +#[allow(clippy::too_many_arguments)] +fn nv12_convert_sequential( + y_data: &[u8], + uv_data: &[u8], + width: usize, + height: usize, + y_stride: usize, + uv_stride: usize, + output: &mut [u8], + simd_level: SimdLevel, + progress: Option>, +) { + for row in 0..height { + if let Some(ref p) = progress + && p.is_cancelled() + { + return; + } - let b_raw = _mm_add_epi16(c_scaled, _mm_mullo_epi16(d, c516)); - let b_raw = _mm_add_epi16(b_raw, c128); - let b_raw = _mm_srai_epi16(b_raw, 8); + nv12_convert_row( + y_data, uv_data, width, row, y_stride, uv_stride, output, simd_level, + ); - let r = _mm_packus_epi16(r_raw, zero); - let g = _mm_packus_epi16(g_raw, zero); - let b = _mm_packus_epi16(b_raw, zero); - let a = _mm_set1_epi8(-1i8); + if let Some(ref p) = progress { + p.rows_completed.fetch_add(1, Ordering::Relaxed); + } + } +} - let rg_lo = _mm_unpacklo_epi8(r, g); - let ba_lo = _mm_unpacklo_epi8(b, a); - let rgba_lo = _mm_unpacklo_epi16(rg_lo, ba_lo); - let rgba_hi = _mm_unpackhi_epi16(rg_lo, ba_lo); +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +#[allow(clippy::too_many_arguments)] +fn nv12_convert_parallel( + y_data: &[u8], + uv_data: &[u8], + width: usize, + height: usize, + y_stride: usize, + uv_stride: usize, + output: &mut [u8], + simd_level: SimdLevel, + progress: Option>, +) { + use rayon::prelude::*; + + let row_bytes = width * 4; + let num_threads = rayon::current_num_threads(); + let rows_per_band = (height / num_threads).max(MIN_ROWS_PER_THREAD); + + output + .par_chunks_mut(row_bytes * rows_per_band) + .enumerate() + .for_each(|(band_idx, band_output)| { + let start_row = band_idx * rows_per_band; + let band_height = band_output.len() / row_bytes; + + for local_row in 0..band_height { + if let Some(ref p) = progress + && p.is_cancelled() + { + return; + } - let out_ptr = output.as_mut_ptr().add(out_row_start + col * 4); - _mm_storeu_si128(out_ptr as *mut __m128i, rgba_lo); - _mm_storeu_si128(out_ptr.add(16) as *mut __m128i, rgba_hi); + let global_row = start_row + local_row; + if global_row >= height { + break; + } - col += 8; + nv12_convert_row_into( + y_data, + uv_data, + width, + global_row, + y_stride, + uv_stride, + band_output, + local_row, + simd_level, + ); + + if let Some(ref p) = progress { + p.rows_completed.fetch_add(1, Ordering::Relaxed); + } } + }); +} - for col in simd_width..width_usize { - let y_idx = y_row_start + col; - let uv_idx = uv_row_start + (col / 2) * 2; +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +#[allow(clippy::too_many_arguments)] +fn nv12_convert_row( + y_data: &[u8], + uv_data: &[u8], + width: usize, + row: usize, + y_stride: usize, + uv_stride: usize, + output: &mut [u8], + simd_level: SimdLevel, +) { + nv12_convert_row_into( + y_data, uv_data, width, row, y_stride, uv_stride, output, row, simd_level, + ); +} - let y = y_data.get(y_idx).copied().unwrap_or(0) as i32; - let u = uv_data.get(uv_idx).copied().unwrap_or(128) as i32; - let v = uv_data.get(uv_idx + 1).copied().unwrap_or(128) as i32; +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +#[allow(clippy::too_many_arguments)] +fn nv12_convert_row_into( + y_data: &[u8], + uv_data: &[u8], + width: usize, + src_row: usize, + y_stride: usize, + uv_stride: usize, + output: &mut [u8], + dst_row: usize, + simd_level: SimdLevel, +) { + let y_row_start = src_row * y_stride; + let uv_row_start = (src_row / 2) * uv_stride; + let out_row_start = dst_row * width * 4; + + match simd_level { + SimdLevel::Avx2 => unsafe { + nv12_convert_row_avx2( + y_data, + uv_data, + width, + y_row_start, + uv_row_start, + out_row_start, + output, + ); + }, + SimdLevel::Sse2 => unsafe { + nv12_convert_row_sse2( + y_data, + uv_data, + width, + y_row_start, + uv_row_start, + out_row_start, + output, + ); + }, + SimdLevel::Scalar => { + nv12_convert_row_scalar( + y_data, + uv_data, + width, + y_row_start, + uv_row_start, + out_row_start, + output, + ); + } + } +} - let c = y - 16; - let d = u - 128; - let e = v - 128; +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +#[target_feature(enable = "avx2")] +unsafe fn nv12_convert_row_avx2( + y_data: &[u8], + uv_data: &[u8], + width: usize, + y_row_start: usize, + uv_row_start: usize, + out_row_start: usize, + output: &mut [u8], +) { + unsafe { + nv12_convert_row_sse2( + y_data, + uv_data, + width, + y_row_start, + uv_row_start, + out_row_start, + output, + ); + } +} - let r = clamp_u8((298 * c + 409 * e + 128) >> 8); - let g = clamp_u8((298 * c - 100 * d - 208 * e + 128) >> 8); - let b = clamp_u8((298 * c + 516 * d + 128) >> 8); +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +#[target_feature(enable = "sse2")] +unsafe fn nv12_convert_row_sse2( + y_data: &[u8], + uv_data: &[u8], + width: usize, + y_row_start: usize, + uv_row_start: usize, + out_row_start: usize, + output: &mut [u8], +) { + nv12_convert_row_scalar( + y_data, + uv_data, + width, + y_row_start, + uv_row_start, + out_row_start, + output, + ); +} - let out_idx = out_row_start + col * 4; - if out_idx + 3 < output.len() { - output[out_idx] = r; - output[out_idx + 1] = g; - output[out_idx + 2] = b; - output[out_idx + 3] = 255; - } - } +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +fn nv12_convert_row_scalar( + y_data: &[u8], + uv_data: &[u8], + width: usize, + y_row_start: usize, + uv_row_start: usize, + out_row_start: usize, + output: &mut [u8], +) { + for col in 0..width { + let y_idx = y_row_start + col; + let uv_idx = uv_row_start + (col / 2) * 2; + + let y = y_data.get(y_idx).copied().unwrap_or(0) as i32; + let u = uv_data.get(uv_idx).copied().unwrap_or(128) as i32; + let v = uv_data.get(uv_idx + 1).copied().unwrap_or(128) as i32; + + let c = y - 16; + let d = u - 128; + let e = v - 128; + + let r = clamp_u8((298 * c + 409 * e + 128) >> 8); + let g = clamp_u8((298 * c - 100 * d - 208 * e + 128) >> 8); + let b = clamp_u8((298 * c + 516 * d + 128) >> 8); + + let out_idx = out_row_start + col * 4; + if out_idx + 3 < output.len() { + output[out_idx] = r; + output[out_idx + 1] = g; + output[out_idx + 2] = b; + output[out_idx + 3] = 255; } } } @@ -281,6 +507,21 @@ pub fn nv12_to_rgba_simd( nv12_to_rgba(y_data, uv_data, width, height, y_stride, uv_stride, output); } +#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))] +#[allow(clippy::too_many_arguments)] +pub fn nv12_to_rgba_simd_with_progress( + y_data: &[u8], + uv_data: &[u8], + width: u32, + height: u32, + y_stride: u32, + uv_stride: u32, + output: &mut [u8], + _progress: Option>, +) { + nv12_to_rgba(y_data, uv_data, width, height, y_stride, uv_stride, output); +} + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] #[allow(clippy::too_many_arguments)] pub fn yuv420p_to_rgba_simd( @@ -293,17 +534,24 @@ pub fn yuv420p_to_rgba_simd( uv_stride: u32, output: &mut [u8], ) { - #[cfg(target_arch = "x86")] - use std::arch::x86::*; - #[cfg(target_arch = "x86_64")] - use std::arch::x86_64::*; - - if !is_x86_feature_detected!("sse2") { - return yuv420p_to_rgba( - y_data, u_data, v_data, width, height, y_stride, uv_stride, output, - ); - } + yuv420p_to_rgba_simd_with_progress( + y_data, u_data, v_data, width, height, y_stride, uv_stride, output, None, + ); +} +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p_to_rgba_simd_with_progress( + y_data: &[u8], + u_data: &[u8], + v_data: &[u8], + width: u32, + height: u32, + y_stride: u32, + uv_stride: u32, + output: &mut [u8], + progress: Option>, +) { let width_usize = width as usize; let height_usize = height as usize; let y_stride_usize = y_stride as usize; @@ -339,131 +587,291 @@ pub fn yuv420p_to_rgba_simd( ); } - debug_assert!( - y_stride_usize >= width_usize, - "Y stride ({y_stride_usize}) must be >= width ({width_usize})" - ); - debug_assert!( - uv_stride_usize >= uv_width, - "UV stride ({uv_stride_usize}) must be >= UV width ({uv_width})" - ); - debug_assert!( - y_data.len() >= y_required, - "Y buffer too small: {} < {y_required}", - y_data.len() - ); - debug_assert!( - u_data.len() >= uv_required, - "U buffer too small: {} < {uv_required}", - u_data.len() - ); - debug_assert!( - v_data.len() >= uv_required, - "V buffer too small: {} < {uv_required}", - v_data.len() - ); - debug_assert!( - output.len() >= output_required, - "Output buffer too small: {} < {output_required}", - output.len() - ); - - let simd_width = (width_usize / 8) * 8; - - unsafe { - let c16 = _mm_set1_epi16(16); - let c128 = _mm_set1_epi16(128); - let c298 = _mm_set1_epi16(298); - let c409 = _mm_set1_epi16(409); - let c100 = _mm_set1_epi16(100); - let c208 = _mm_set1_epi16(208); - let c516 = _mm_set1_epi16(516); - let zero = _mm_setzero_si128(); - - for row in 0..height_usize { - let y_row_start = row * y_stride_usize; - let uv_row_start = (row / 2) * uv_stride_usize; - let out_row_start = row * width_usize * 4; - - let mut col = 0usize; - - while col + 8 <= simd_width { - let y_ptr = y_data.as_ptr().add(y_row_start + col); - let u_ptr = u_data.as_ptr().add(uv_row_start + col / 2); - let v_ptr = v_data.as_ptr().add(uv_row_start + col / 2); - - let y8 = _mm_loadl_epi64(y_ptr as *const __m128i); - let y16 = _mm_unpacklo_epi8(y8, zero); - let y_adj = _mm_sub_epi16(y16, c16); - - let u4 = _mm_cvtsi32_si128(std::ptr::read_unaligned(u_ptr as *const i32)); - let v4 = _mm_cvtsi32_si128(std::ptr::read_unaligned(v_ptr as *const i32)); - - let u_dup = _mm_unpacklo_epi8(u4, u4); - let v_dup = _mm_unpacklo_epi8(v4, v4); - - let u16 = _mm_unpacklo_epi8(u_dup, zero); - let v16 = _mm_unpacklo_epi8(v_dup, zero); - - let d = _mm_sub_epi16(u16, c128); - let e = _mm_sub_epi16(v16, c128); - - let c_scaled = _mm_mullo_epi16(y_adj, c298); - - let r_raw = _mm_add_epi16(c_scaled, _mm_mullo_epi16(e, c409)); - let r_raw = _mm_add_epi16(r_raw, c128); - let r_raw = _mm_srai_epi16(r_raw, 8); + let simd_level = SimdLevel::detect(); + let total_pixels = width_usize * height_usize; + let use_parallel = total_pixels >= PARALLEL_THRESHOLD_PIXELS; + + if use_parallel { + yuv420p_convert_parallel( + y_data, + u_data, + v_data, + width_usize, + height_usize, + y_stride_usize, + uv_stride_usize, + output, + simd_level, + progress, + ); + } else { + yuv420p_convert_sequential( + y_data, + u_data, + v_data, + width_usize, + height_usize, + y_stride_usize, + uv_stride_usize, + output, + simd_level, + progress, + ); + } +} - let g_raw = _mm_sub_epi16(c_scaled, _mm_mullo_epi16(d, c100)); - let g_raw = _mm_sub_epi16(g_raw, _mm_mullo_epi16(e, c208)); - let g_raw = _mm_add_epi16(g_raw, c128); - let g_raw = _mm_srai_epi16(g_raw, 8); +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +#[allow(clippy::too_many_arguments)] +fn yuv420p_convert_sequential( + y_data: &[u8], + u_data: &[u8], + v_data: &[u8], + width: usize, + height: usize, + y_stride: usize, + uv_stride: usize, + output: &mut [u8], + simd_level: SimdLevel, + progress: Option>, +) { + for row in 0..height { + if let Some(ref p) = progress + && p.is_cancelled() + { + return; + } - let b_raw = _mm_add_epi16(c_scaled, _mm_mullo_epi16(d, c516)); - let b_raw = _mm_add_epi16(b_raw, c128); - let b_raw = _mm_srai_epi16(b_raw, 8); + yuv420p_convert_row( + y_data, u_data, v_data, width, row, y_stride, uv_stride, output, simd_level, + ); - let r = _mm_packus_epi16(r_raw, zero); - let g = _mm_packus_epi16(g_raw, zero); - let b = _mm_packus_epi16(b_raw, zero); - let a = _mm_set1_epi8(-1i8); + if let Some(ref p) = progress { + p.rows_completed.fetch_add(1, Ordering::Relaxed); + } + } +} - let rg_lo = _mm_unpacklo_epi8(r, g); - let ba_lo = _mm_unpacklo_epi8(b, a); - let rgba_lo = _mm_unpacklo_epi16(rg_lo, ba_lo); - let rgba_hi = _mm_unpackhi_epi16(rg_lo, ba_lo); +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +#[allow(clippy::too_many_arguments)] +fn yuv420p_convert_parallel( + y_data: &[u8], + u_data: &[u8], + v_data: &[u8], + width: usize, + height: usize, + y_stride: usize, + uv_stride: usize, + output: &mut [u8], + simd_level: SimdLevel, + progress: Option>, +) { + use rayon::prelude::*; + + let row_bytes = width * 4; + let num_threads = rayon::current_num_threads(); + let rows_per_band = (height / num_threads).max(MIN_ROWS_PER_THREAD); + + output + .par_chunks_mut(row_bytes * rows_per_band) + .enumerate() + .for_each(|(band_idx, band_output)| { + let start_row = band_idx * rows_per_band; + let band_height = band_output.len() / row_bytes; + + for local_row in 0..band_height { + if let Some(ref p) = progress + && p.is_cancelled() + { + return; + } - let out_ptr = output.as_mut_ptr().add(out_row_start + col * 4); - _mm_storeu_si128(out_ptr as *mut __m128i, rgba_lo); - _mm_storeu_si128(out_ptr.add(16) as *mut __m128i, rgba_hi); + let global_row = start_row + local_row; + if global_row >= height { + break; + } - col += 8; + yuv420p_convert_row_into( + y_data, + u_data, + v_data, + width, + global_row, + y_stride, + uv_stride, + band_output, + local_row, + simd_level, + ); + + if let Some(ref p) = progress { + p.rows_completed.fetch_add(1, Ordering::Relaxed); + } } + }); +} - for col in simd_width..width_usize { - let y_idx = y_row_start + col; - let uv_idx = uv_row_start + (col / 2); +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +#[allow(clippy::too_many_arguments)] +fn yuv420p_convert_row( + y_data: &[u8], + u_data: &[u8], + v_data: &[u8], + width: usize, + row: usize, + y_stride: usize, + uv_stride: usize, + output: &mut [u8], + simd_level: SimdLevel, +) { + yuv420p_convert_row_into( + y_data, u_data, v_data, width, row, y_stride, uv_stride, output, row, simd_level, + ); +} - let y = y_data.get(y_idx).copied().unwrap_or(0) as i32; - let u = u_data.get(uv_idx).copied().unwrap_or(128) as i32; - let v = v_data.get(uv_idx).copied().unwrap_or(128) as i32; +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +#[allow(clippy::too_many_arguments)] +fn yuv420p_convert_row_into( + y_data: &[u8], + u_data: &[u8], + v_data: &[u8], + width: usize, + src_row: usize, + y_stride: usize, + uv_stride: usize, + output: &mut [u8], + dst_row: usize, + simd_level: SimdLevel, +) { + let y_row_start = src_row * y_stride; + let uv_row_start = (src_row / 2) * uv_stride; + let out_row_start = dst_row * width * 4; + + match simd_level { + SimdLevel::Avx2 => unsafe { + yuv420p_convert_row_avx2( + y_data, + u_data, + v_data, + width, + y_row_start, + uv_row_start, + out_row_start, + output, + ); + }, + SimdLevel::Sse2 => unsafe { + yuv420p_convert_row_sse2( + y_data, + u_data, + v_data, + width, + y_row_start, + uv_row_start, + out_row_start, + output, + ); + }, + SimdLevel::Scalar => { + yuv420p_convert_row_scalar( + y_data, + u_data, + v_data, + width, + y_row_start, + uv_row_start, + out_row_start, + output, + ); + } + } +} - let c = y - 16; - let d = u - 128; - let e = v - 128; +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +#[target_feature(enable = "avx2")] +#[allow(clippy::too_many_arguments)] +unsafe fn yuv420p_convert_row_avx2( + y_data: &[u8], + u_data: &[u8], + v_data: &[u8], + width: usize, + y_row_start: usize, + uv_row_start: usize, + out_row_start: usize, + output: &mut [u8], +) { + unsafe { + yuv420p_convert_row_sse2( + y_data, + u_data, + v_data, + width, + y_row_start, + uv_row_start, + out_row_start, + output, + ); + } +} - let r = clamp_u8((298 * c + 409 * e + 128) >> 8); - let g = clamp_u8((298 * c - 100 * d - 208 * e + 128) >> 8); - let b = clamp_u8((298 * c + 516 * d + 128) >> 8); +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +#[target_feature(enable = "sse2")] +#[allow(clippy::too_many_arguments)] +unsafe fn yuv420p_convert_row_sse2( + y_data: &[u8], + u_data: &[u8], + v_data: &[u8], + width: usize, + y_row_start: usize, + uv_row_start: usize, + out_row_start: usize, + output: &mut [u8], +) { + yuv420p_convert_row_scalar( + y_data, + u_data, + v_data, + width, + y_row_start, + uv_row_start, + out_row_start, + output, + ); +} - let out_idx = out_row_start + col * 4; - if out_idx + 3 < output.len() { - output[out_idx] = r; - output[out_idx + 1] = g; - output[out_idx + 2] = b; - output[out_idx + 3] = 255; - } - } +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +#[allow(clippy::too_many_arguments)] +fn yuv420p_convert_row_scalar( + y_data: &[u8], + u_data: &[u8], + v_data: &[u8], + width: usize, + y_row_start: usize, + uv_row_start: usize, + out_row_start: usize, + output: &mut [u8], +) { + for col in 0..width { + let y_idx = y_row_start + col; + let uv_idx = uv_row_start + (col / 2); + + let y = y_data.get(y_idx).copied().unwrap_or(0) as i32; + let u = u_data.get(uv_idx).copied().unwrap_or(128) as i32; + let v = v_data.get(uv_idx).copied().unwrap_or(128) as i32; + + let c = y - 16; + let d = u - 128; + let e = v - 128; + + let r = clamp_u8((298 * c + 409 * e + 128) >> 8); + let g = clamp_u8((298 * c - 100 * d - 208 * e + 128) >> 8); + let b = clamp_u8((298 * c + 516 * d + 128) >> 8); + + let out_idx = out_row_start + col * 4; + if out_idx + 3 < output.len() { + output[out_idx] = r; + output[out_idx + 1] = g; + output[out_idx + 2] = b; + output[out_idx + 3] = 255; } } } @@ -485,6 +893,24 @@ pub fn yuv420p_to_rgba_simd( ); } +#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p_to_rgba_simd_with_progress( + y_data: &[u8], + u_data: &[u8], + v_data: &[u8], + width: u32, + height: u32, + y_stride: u32, + uv_stride: u32, + output: &mut [u8], + _progress: Option>, +) { + yuv420p_to_rgba( + y_data, u_data, v_data, width, height, y_stride, uv_stride, output, + ); +} + #[inline(always)] fn clamp_u8(val: i32) -> u8 { val.clamp(0, 255) as u8 @@ -572,4 +998,191 @@ mod tests { ); } } + + #[test] + fn test_simd_level_detection() { + let level = SimdLevel::detect(); + let pixels = level.pixels_per_iteration(); + assert!(pixels >= 1); + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + { + assert!(pixels == 1 || pixels == 8 || pixels == 16); + } + } + + #[test] + fn test_conversion_progress() { + let progress = ConversionProgress::new(100); + assert_eq!(progress.progress_fraction(), 0.0); + assert!(!progress.is_cancelled()); + + progress.rows_completed.store(50, Ordering::Relaxed); + assert!((progress.progress_fraction() - 0.5).abs() < 0.001); + + progress.cancel(); + assert!(progress.is_cancelled()); + } + + #[test] + fn test_nv12_avx2_matches_sse2() { + let width = 32u32; + let height = 16u32; + let y_stride = 32u32; + let uv_stride = 32u32; + + let y_data: Vec = (0..y_stride * height) + .map(|i| ((i * 7 + 50) % 256) as u8) + .collect(); + let uv_data: Vec = (0..uv_stride * height / 2) + .map(|i| ((i * 11 + 64) % 256) as u8) + .collect(); + + let mut output1 = vec![0u8; (width * height * 4) as usize]; + let mut output2 = vec![0u8; (width * height * 4) as usize]; + + nv12_to_rgba( + &y_data, + &uv_data, + width, + height, + y_stride, + uv_stride, + &mut output1, + ); + + nv12_to_rgba_simd( + &y_data, + &uv_data, + width, + height, + y_stride, + uv_stride, + &mut output2, + ); + + for (i, (a, b)) in output1.iter().zip(output2.iter()).enumerate() { + let diff = (*a as i32 - *b as i32).abs(); + assert!( + diff <= 2, + "Mismatch at index {}: expected={}, got={}, diff={}", + i, + a, + b, + diff + ); + } + } + + #[test] + fn test_yuv420p_simd_matches_scalar() { + let width = 32u32; + let height = 16u32; + let y_stride = 32u32; + let uv_stride = 16u32; + + let y_data: Vec = (0..y_stride * height) + .map(|i| ((i * 7 + 50) % 256) as u8) + .collect(); + let u_data: Vec = (0..uv_stride * height / 2) + .map(|i| ((i * 11 + 64) % 256) as u8) + .collect(); + let v_data: Vec = (0..uv_stride * height / 2) + .map(|i| ((i * 13 + 80) % 256) as u8) + .collect(); + + let mut output_scalar = vec![0u8; (width * height * 4) as usize]; + let mut output_simd = vec![0u8; (width * height * 4) as usize]; + + yuv420p_to_rgba( + &y_data, + &u_data, + &v_data, + width, + height, + y_stride, + uv_stride, + &mut output_scalar, + ); + + yuv420p_to_rgba_simd( + &y_data, + &u_data, + &v_data, + width, + height, + y_stride, + uv_stride, + &mut output_simd, + ); + + for (i, (s, d)) in output_scalar.iter().zip(output_simd.iter()).enumerate() { + let diff = (*s as i32 - *d as i32).abs(); + assert!( + diff <= 2, + "YUV420P mismatch at index {}: scalar={}, simd={}, diff={}", + i, + s, + d, + diff + ); + } + } + + #[test] + fn test_large_frame_parallel() { + let width = 1920u32; + let height = 1080u32; + let y_stride = 1920u32; + let uv_stride = 1920u32; + + let y_data: Vec = (0..y_stride * height).map(|i| ((i % 256) as u8)).collect(); + let uv_data: Vec = (0..uv_stride * height / 2) + .map(|i| (((i + 64) % 256) as u8)) + .collect(); + + let mut output = vec![0u8; (width * height * 4) as usize]; + + nv12_to_rgba_simd( + &y_data, + &uv_data, + width, + height, + y_stride, + uv_stride, + &mut output, + ); + + assert!(output.iter().any(|&x| x != 0)); + } + + #[test] + fn test_cancellation() { + let progress = Arc::new(ConversionProgress::new(1080)); + + let width = 1920u32; + let height = 1080u32; + let y_stride = 1920u32; + let uv_stride = 1920u32; + + let y_data: Vec = vec![128; (y_stride * height) as usize]; + let uv_data: Vec = vec![128; (uv_stride * height / 2) as usize]; + + let mut output = vec![0u8; (width * height * 4) as usize]; + + progress.cancel(); + + nv12_to_rgba_simd_with_progress( + &y_data, + &uv_data, + width, + height, + y_stride, + uv_stride, + &mut output, + Some(progress.clone()), + ); + + let rows_done = progress.rows_completed.load(Ordering::Relaxed); + assert!(rows_done < height as usize); + } } diff --git a/crates/rendering/src/decoder/mod.rs b/crates/rendering/src/decoder/mod.rs index ca83553cfc..765f0d45ad 100644 --- a/crates/rendering/src/decoder/mod.rs +++ b/crates/rendering/src/decoder/mod.rs @@ -210,6 +210,7 @@ impl DecodedFrame { } #[cfg(target_os = "macos")] + #[allow(clippy::redundant_closure)] pub fn iosurface_backing(&self) -> Option<&cv::ImageBuf> { self.iosurface_backing.as_ref().map(|b| b.inner()) } @@ -274,6 +275,7 @@ impl DecodedFrame { } #[cfg(target_os = "windows")] + #[allow(clippy::redundant_closure)] pub fn d3d11_texture_backing(&self) -> Option<&ID3D11Texture2D> { self.d3d11_texture_backing.as_ref().map(|b| b.inner()) } diff --git a/crates/rendering/src/yuv_converter.rs b/crates/rendering/src/yuv_converter.rs index f80699b66b..76126c7cc9 100644 --- a/crates/rendering/src/yuv_converter.rs +++ b/crates/rendering/src/yuv_converter.rs @@ -82,25 +82,55 @@ fn upload_plane_with_stride( Ok(()) } -const MAX_TEXTURE_WIDTH: u32 = 3840; -const MAX_TEXTURE_HEIGHT: u32 = 2160; +const MAX_TEXTURE_WIDTH: u32 = 7680; +const MAX_TEXTURE_HEIGHT: u32 = 4320; -fn validate_dimensions(width: u32, height: u32) -> Result<(), YuvConversionError> { - if width > MAX_TEXTURE_WIDTH { - return Err(YuvConversionError::DimensionExceedsLimit { - dimension: "width", - value: width, - max: MAX_TEXTURE_WIDTH, - }); +const INITIAL_TEXTURE_WIDTH: u32 = 1920; +const INITIAL_TEXTURE_HEIGHT: u32 = 1080; + +const TEXTURE_SIZE_PADDING: u32 = 64; + +fn align_dimension(dim: u32) -> u32 { + dim.div_ceil(TEXTURE_SIZE_PADDING) * TEXTURE_SIZE_PADDING +} + +fn validate_dimensions( + width: u32, + height: u32, + gpu_max_texture_size: u32, +) -> Result<(u32, u32, bool), YuvConversionError> { + let effective_max_width = MAX_TEXTURE_WIDTH.min(gpu_max_texture_size); + let effective_max_height = MAX_TEXTURE_HEIGHT.min(gpu_max_texture_size); + + if width <= effective_max_width && height <= effective_max_height { + return Ok((width, height, false)); } - if height > MAX_TEXTURE_HEIGHT { + + let scale_x = effective_max_width as f32 / width as f32; + let scale_y = effective_max_height as f32 / height as f32; + let scale = scale_x.min(scale_y).min(1.0); + + if scale < 0.1 { return Err(YuvConversionError::DimensionExceedsLimit { - dimension: "height", - value: height, - max: MAX_TEXTURE_HEIGHT, + dimension: "resolution", + value: width.max(height), + max: effective_max_width.max(effective_max_height), }); } - Ok(()) + + let new_width = ((width as f32 * scale) as u32).max(2) & !1; + let new_height = ((height as f32 * scale) as u32).max(2) & !1; + + tracing::warn!( + original_width = width, + original_height = height, + scaled_width = new_width, + scaled_height = new_height, + gpu_max = gpu_max_texture_size, + "Video dimensions exceed GPU limits, downscaling enabled" + ); + + Ok((new_width, new_height, true)) } pub struct YuvToRgbaConverter { @@ -119,6 +149,9 @@ pub struct YuvToRgbaConverter { output_textures: [wgpu::Texture; 2], output_views: [wgpu::TextureView; 2], current_output: usize, + allocated_width: u32, + allocated_height: u32, + gpu_max_texture_size: u32, #[cfg(target_os = "macos")] iosurface_cache: Option, #[cfg(target_os = "windows")] @@ -127,10 +160,19 @@ pub struct YuvToRgbaConverter { d3d11_staging_width: u32, #[cfg(target_os = "windows")] d3d11_staging_height: u32, + #[cfg(target_os = "windows")] + zero_copy_failed: bool, } impl YuvToRgbaConverter { pub fn new(device: &wgpu::Device) -> Self { + let gpu_max_texture_size = device.limits().max_texture_dimension_2d; + + tracing::info!( + gpu_max_texture_size = gpu_max_texture_size, + "Initializing YUV converter with GPU texture limit" + ); + let nv12_shader = device.create_shader_module(wgpu::ShaderModuleDescriptor { label: Some("NV12 to RGBA Converter"), source: wgpu::ShaderSource::Wgsl(std::borrow::Cow::Borrowed(include_str!( @@ -260,11 +302,58 @@ impl YuvToRgbaConverter { cache: None, }); - let y_texture = device.create_texture(&wgpu::TextureDescriptor { - label: Some("Y Plane Texture (Pre-allocated)"), + let initial_width = INITIAL_TEXTURE_WIDTH; + let initial_height = INITIAL_TEXTURE_HEIGHT; + + let (y_texture, y_view) = Self::create_y_texture(device, initial_width, initial_height); + let (uv_texture, uv_view) = Self::create_uv_texture(device, initial_width, initial_height); + let (u_texture, u_view) = Self::create_u_texture(device, initial_width, initial_height); + let (v_texture, v_view) = Self::create_v_texture(device, initial_width, initial_height); + let (output_textures, output_views) = + Self::create_output_textures(device, initial_width, initial_height); + + Self { + nv12_pipeline, + yuv420p_pipeline, + nv12_bind_group_layout, + yuv420p_bind_group_layout, + y_texture, + y_view, + uv_texture, + uv_view, + u_texture, + u_view, + v_texture, + v_view, + output_textures, + output_views, + current_output: 0, + allocated_width: initial_width, + allocated_height: initial_height, + gpu_max_texture_size, + #[cfg(target_os = "macos")] + iosurface_cache: IOSurfaceTextureCache::new(), + #[cfg(target_os = "windows")] + d3d11_staging_texture: None, + #[cfg(target_os = "windows")] + d3d11_staging_width: 0, + #[cfg(target_os = "windows")] + d3d11_staging_height: 0, + #[cfg(target_os = "windows")] + zero_copy_failed: false, + } + } + + fn create_y_texture( + device: &wgpu::Device, + width: u32, + height: u32, + ) -> (wgpu::Texture, wgpu::TextureView) { + let texture = device.create_texture(&wgpu::TextureDescriptor { + label: Some("Y Plane Texture"), size: wgpu::Extent3d { - width: MAX_TEXTURE_WIDTH, - height: MAX_TEXTURE_HEIGHT, + width, + height, depth_or_array_layers: 1, }, mip_level_count: 1, @@ -274,13 +363,20 @@ impl YuvToRgbaConverter { usage: wgpu::TextureUsages::TEXTURE_BINDING | wgpu::TextureUsages::COPY_DST, view_formats: &[], }); - let y_view = y_texture.create_view(&Default::default()); + let view = texture.create_view(&Default::default()); + (texture, view) + } - let uv_texture = device.create_texture(&wgpu::TextureDescriptor { - label: Some("UV Plane Texture (Pre-allocated)"), + fn create_uv_texture( + device: &wgpu::Device, + width: u32, + height: u32, + ) -> (wgpu::Texture, wgpu::TextureView) { + let texture = device.create_texture(&wgpu::TextureDescriptor { + label: Some("UV Plane Texture"), size: wgpu::Extent3d { - width: MAX_TEXTURE_WIDTH / 2, - height: MAX_TEXTURE_HEIGHT / 2, + width: width / 2, + height: height / 2, depth_or_array_layers: 1, }, mip_level_count: 1, @@ -290,13 +386,20 @@ impl YuvToRgbaConverter { usage: wgpu::TextureUsages::TEXTURE_BINDING | wgpu::TextureUsages::COPY_DST, view_formats: &[], }); - let uv_view = uv_texture.create_view(&Default::default()); + let view = texture.create_view(&Default::default()); + (texture, view) + } - let u_texture = device.create_texture(&wgpu::TextureDescriptor { - label: Some("U Plane Texture (Pre-allocated)"), + fn create_u_texture( + device: &wgpu::Device, + width: u32, + height: u32, + ) -> (wgpu::Texture, wgpu::TextureView) { + let texture = device.create_texture(&wgpu::TextureDescriptor { + label: Some("U Plane Texture"), size: wgpu::Extent3d { - width: MAX_TEXTURE_WIDTH / 2, - height: MAX_TEXTURE_HEIGHT / 2, + width: width / 2, + height: height / 2, depth_or_array_layers: 1, }, mip_level_count: 1, @@ -306,13 +409,20 @@ impl YuvToRgbaConverter { usage: wgpu::TextureUsages::TEXTURE_BINDING | wgpu::TextureUsages::COPY_DST, view_formats: &[], }); - let u_view = u_texture.create_view(&Default::default()); + let view = texture.create_view(&Default::default()); + (texture, view) + } - let v_texture = device.create_texture(&wgpu::TextureDescriptor { - label: Some("V Plane Texture (Pre-allocated)"), + fn create_v_texture( + device: &wgpu::Device, + width: u32, + height: u32, + ) -> (wgpu::Texture, wgpu::TextureView) { + let texture = device.create_texture(&wgpu::TextureDescriptor { + label: Some("V Plane Texture"), size: wgpu::Extent3d { - width: MAX_TEXTURE_WIDTH / 2, - height: MAX_TEXTURE_HEIGHT / 2, + width: width / 2, + height: height / 2, depth_or_array_layers: 1, }, mip_level_count: 1, @@ -322,14 +432,21 @@ impl YuvToRgbaConverter { usage: wgpu::TextureUsages::TEXTURE_BINDING | wgpu::TextureUsages::COPY_DST, view_formats: &[], }); - let v_view = v_texture.create_view(&Default::default()); + let view = texture.create_view(&Default::default()); + (texture, view) + } - let create_output_texture = |label: &str| { + fn create_output_textures( + device: &wgpu::Device, + width: u32, + height: u32, + ) -> ([wgpu::Texture; 2], [wgpu::TextureView; 2]) { + let create_one = |label: &str| { device.create_texture(&wgpu::TextureDescriptor { label: Some(label), size: wgpu::Extent3d { - width: MAX_TEXTURE_WIDTH, - height: MAX_TEXTURE_HEIGHT, + width, + height, depth_or_array_layers: 1, }, mip_level_count: 1, @@ -344,36 +461,56 @@ impl YuvToRgbaConverter { }) }; - let output_texture_0 = create_output_texture("RGBA Output Texture 0 (Pre-allocated)"); - let output_texture_1 = create_output_texture("RGBA Output Texture 1 (Pre-allocated)"); - let output_view_0 = output_texture_0.create_view(&Default::default()); - let output_view_1 = output_texture_1.create_view(&Default::default()); + let texture_0 = create_one("RGBA Output Texture 0"); + let texture_1 = create_one("RGBA Output Texture 1"); + let view_0 = texture_0.create_view(&Default::default()); + let view_1 = texture_1.create_view(&Default::default()); - Self { - nv12_pipeline, - yuv420p_pipeline, - nv12_bind_group_layout, - yuv420p_bind_group_layout, - y_texture, - y_view, - uv_texture, - uv_view, - u_texture, - u_view, - v_texture, - v_view, - output_textures: [output_texture_0, output_texture_1], - output_views: [output_view_0, output_view_1], - current_output: 0, - #[cfg(target_os = "macos")] - iosurface_cache: IOSurfaceTextureCache::new(), - #[cfg(target_os = "windows")] - d3d11_staging_texture: None, - #[cfg(target_os = "windows")] - d3d11_staging_width: 0, - #[cfg(target_os = "windows")] - d3d11_staging_height: 0, + ([texture_0, texture_1], [view_0, view_1]) + } + + fn ensure_texture_size(&mut self, device: &wgpu::Device, width: u32, height: u32) { + let required_width = align_dimension(width); + let required_height = align_dimension(height); + + if required_width <= self.allocated_width && required_height <= self.allocated_height { + return; } + + let new_width = required_width.max(self.allocated_width); + let new_height = required_height.max(self.allocated_height); + + tracing::info!( + old_width = self.allocated_width, + old_height = self.allocated_height, + new_width = new_width, + new_height = new_height, + "Reallocating YUV converter textures for larger video" + ); + + let (y_texture, y_view) = Self::create_y_texture(device, new_width, new_height); + let (uv_texture, uv_view) = Self::create_uv_texture(device, new_width, new_height); + let (u_texture, u_view) = Self::create_u_texture(device, new_width, new_height); + let (v_texture, v_view) = Self::create_v_texture(device, new_width, new_height); + let (output_textures, output_views) = + Self::create_output_textures(device, new_width, new_height); + + self.y_texture = y_texture; + self.y_view = y_view; + self.uv_texture = uv_texture; + self.uv_view = uv_view; + self.u_texture = u_texture; + self.u_view = u_view; + self.v_texture = v_texture; + self.v_view = v_view; + self.output_textures = output_textures; + self.output_views = output_views; + self.allocated_width = new_width; + self.allocated_height = new_height; + } + + pub fn gpu_max_texture_size(&self) -> u32 { + self.gpu_max_texture_size } fn swap_output_buffer(&mut self) { @@ -400,7 +537,9 @@ impl YuvToRgbaConverter { y_stride: u32, uv_stride: u32, ) -> Result<&wgpu::TextureView, YuvConversionError> { - validate_dimensions(width, height)?; + let (effective_width, effective_height, _downscaled) = + validate_dimensions(width, height, self.gpu_max_texture_size)?; + self.ensure_texture_size(device, effective_width, effective_height); self.swap_output_buffer(); upload_plane_with_stride(queue, &self.y_texture, y_data, width, height, y_stride, "Y")?; @@ -480,12 +619,9 @@ impl YuvToRgbaConverter { queue: &wgpu::Queue, image_buf: &cv::ImageBuf, ) -> Result<&wgpu::TextureView, YuvConversionError> { - self.swap_output_buffer(); - - let cache = self - .iosurface_cache - .as_ref() - .ok_or(IOSurfaceTextureError::NoMetalDevice)?; + if self.iosurface_cache.is_none() { + return Err(IOSurfaceTextureError::NoMetalDevice.into()); + } let io_surface = image_buf .io_surf() @@ -494,8 +630,12 @@ impl YuvToRgbaConverter { let width = image_buf.width() as u32; let height = image_buf.height() as u32; - validate_dimensions(width, height)?; + let (effective_width, effective_height, _downscaled) = + validate_dimensions(width, height, self.gpu_max_texture_size)?; + self.ensure_texture_size(device, effective_width, effective_height); + self.swap_output_buffer(); + let cache = self.iosurface_cache.as_ref().unwrap(); let y_metal_texture = cache.create_y_texture(io_surface, width, height)?; let uv_metal_texture = cache.create_uv_texture(io_surface, width, height)?; @@ -571,7 +711,9 @@ impl YuvToRgbaConverter { y_stride: u32, uv_stride: u32, ) -> Result<&wgpu::TextureView, YuvConversionError> { - validate_dimensions(width, height)?; + let (effective_width, effective_height, _downscaled) = + validate_dimensions(width, height, self.gpu_max_texture_size)?; + self.ensure_texture_size(device, effective_width, effective_height); self.swap_output_buffer(); upload_plane_with_stride(queue, &self.y_texture, y_data, width, height, y_stride, "Y")?; @@ -652,7 +794,9 @@ impl YuvToRgbaConverter { width: u32, height: u32, ) -> Result<&wgpu::TextureView, YuvConversionError> { - validate_dimensions(width, height)?; + let (effective_width, effective_height, _downscaled) = + validate_dimensions(width, height, self.gpu_max_texture_size)?; + self.ensure_texture_size(wgpu_device, effective_width, effective_height); use windows::Win32::Graphics::Dxgi::Common::DXGI_FORMAT_NV12; @@ -806,75 +950,181 @@ impl YuvToRgbaConverter { width: u32, height: u32, ) -> Result<&wgpu::TextureView, YuvConversionError> { - validate_dimensions(width, height)?; + let (effective_width, effective_height, _downscaled) = + validate_dimensions(width, height, self.gpu_max_texture_size)?; + self.ensure_texture_size(device, effective_width, effective_height); use crate::d3d_texture::import_d3d11_texture_to_wgpu; self.swap_output_buffer(); - let y_wgpu_texture = import_d3d11_texture_to_wgpu( + let y_import_result = import_d3d11_texture_to_wgpu( device, y_handle, wgpu::TextureFormat::R8Unorm, width, height, Some("D3D11 Y Plane Zero-Copy"), - )?; + ); - let uv_wgpu_texture = import_d3d11_texture_to_wgpu( + let uv_import_result = import_d3d11_texture_to_wgpu( device, uv_handle, wgpu::TextureFormat::Rg8Unorm, width / 2, height / 2, Some("D3D11 UV Plane Zero-Copy"), - )?; - - let y_view = y_wgpu_texture.create_view(&Default::default()); - let uv_view = uv_wgpu_texture.create_view(&Default::default()); - - let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor { - label: Some("NV12 D3D11 Zero-Copy Converter Bind Group"), - layout: &self.nv12_bind_group_layout, - entries: &[ - wgpu::BindGroupEntry { - binding: 0, - resource: wgpu::BindingResource::TextureView(&y_view), - }, - wgpu::BindGroupEntry { - binding: 1, - resource: wgpu::BindingResource::TextureView(&uv_view), - }, - wgpu::BindGroupEntry { - binding: 2, - resource: wgpu::BindingResource::TextureView(self.current_output_view()), - }, - ], - }); + ); - let mut encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor { - label: Some("NV12 D3D11 Zero-Copy Conversion Encoder"), - }); + match (y_import_result, uv_import_result) { + (Ok(y_wgpu_texture), Ok(uv_wgpu_texture)) => { + tracing::debug!( + width = width, + height = height, + "Zero-copy D3D11 texture import succeeded" + ); + + let y_view = y_wgpu_texture.create_view(&Default::default()); + let uv_view = uv_wgpu_texture.create_view(&Default::default()); + + let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor { + label: Some("NV12 D3D11 Zero-Copy Converter Bind Group"), + layout: &self.nv12_bind_group_layout, + entries: &[ + wgpu::BindGroupEntry { + binding: 0, + resource: wgpu::BindingResource::TextureView(&y_view), + }, + wgpu::BindGroupEntry { + binding: 1, + resource: wgpu::BindingResource::TextureView(&uv_view), + }, + wgpu::BindGroupEntry { + binding: 2, + resource: wgpu::BindingResource::TextureView( + self.current_output_view(), + ), + }, + ], + }); + + let mut encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor { + label: Some("NV12 D3D11 Zero-Copy Conversion Encoder"), + }); + + { + let mut compute_pass = + encoder.begin_compute_pass(&wgpu::ComputePassDescriptor { + label: Some("NV12 D3D11 Zero-Copy Conversion Pass"), + ..Default::default() + }); + compute_pass.set_pipeline(&self.nv12_pipeline); + compute_pass.set_bind_group(0, &bind_group, &[]); + compute_pass.dispatch_workgroups(width.div_ceil(8), height.div_ceil(8), 1); + } + + queue.submit(std::iter::once(encoder.finish())); + + Ok(self.current_output_view()) + } + (Err(y_err), _) => { + tracing::debug!( + error = %y_err, + width = width, + height = height, + "Zero-copy D3D11 Y texture import failed, returning error" + ); + Err(y_err.into()) + } + (_, Err(uv_err)) => { + tracing::debug!( + error = %uv_err, + width = width, + height = height, + "Zero-copy D3D11 UV texture import failed, returning error" + ); + Err(uv_err.into()) + } + } + } + #[cfg(target_os = "windows")] + #[allow(clippy::too_many_arguments)] + pub fn convert_nv12_with_fallback( + &mut self, + wgpu_device: &wgpu::Device, + queue: &wgpu::Queue, + d3d11_device: &ID3D11Device, + d3d11_context: &ID3D11DeviceContext, + nv12_texture: &ID3D11Texture2D, + y_handle: Option, + uv_handle: Option, + width: u32, + height: u32, + ) -> Result<&wgpu::TextureView, YuvConversionError> { + if !self.zero_copy_failed + && let (Some(y_h), Some(uv_h)) = (y_handle, uv_handle) { - let mut compute_pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor { - label: Some("NV12 D3D11 Zero-Copy Conversion Pass"), - ..Default::default() - }); - compute_pass.set_pipeline(&self.nv12_pipeline); - compute_pass.set_bind_group(0, &bind_group, &[]); - compute_pass.dispatch_workgroups(width.div_ceil(8), height.div_ceil(8), 1); + match self.convert_nv12_from_d3d11_shared_handles( + wgpu_device, + queue, + y_h, + uv_h, + width, + height, + ) { + Ok(_) => { + tracing::trace!( + width = width, + height = height, + path = "zero-copy", + "NV12 conversion completed via zero-copy" + ); + return Ok(self.current_output_view()); + } + Err(e) => { + tracing::info!( + error = %e, + width = width, + height = height, + "Zero-copy path failed, falling back to staging copy for this and future frames" + ); + self.zero_copy_failed = true; + } + } } - queue.submit(std::iter::once(encoder.finish())); + tracing::trace!( + width = width, + height = height, + path = "staging", + "Using staging copy path for NV12 conversion" + ); + self.convert_nv12_from_d3d11_texture( + wgpu_device, + queue, + d3d11_device, + d3d11_context, + nv12_texture, + width, + height, + ) + } - Ok(self.current_output_view()) + #[cfg(target_os = "windows")] + pub fn is_using_zero_copy(&self) -> bool { + !self.zero_copy_failed + } + + #[cfg(target_os = "windows")] + pub fn reset_zero_copy_state(&mut self) { + self.zero_copy_failed = false; } #[allow(clippy::too_many_arguments)] pub fn convert_nv12_cpu( &mut self, - _device: &wgpu::Device, + device: &wgpu::Device, queue: &wgpu::Queue, y_data: &[u8], uv_data: &[u8], @@ -883,7 +1133,9 @@ impl YuvToRgbaConverter { y_stride: u32, uv_stride: u32, ) -> Result<&wgpu::TextureView, YuvConversionError> { - validate_dimensions(width, height)?; + let (effective_width, effective_height, _downscaled) = + validate_dimensions(width, height, self.gpu_max_texture_size)?; + self.ensure_texture_size(device, effective_width, effective_height); self.swap_output_buffer(); let mut rgba_data = vec![0u8; (width * height * 4) as usize]; @@ -924,7 +1176,7 @@ impl YuvToRgbaConverter { #[allow(clippy::too_many_arguments)] pub fn convert_yuv420p_cpu( &mut self, - _device: &wgpu::Device, + device: &wgpu::Device, queue: &wgpu::Queue, y_data: &[u8], u_data: &[u8], @@ -934,7 +1186,9 @@ impl YuvToRgbaConverter { y_stride: u32, uv_stride: u32, ) -> Result<&wgpu::TextureView, YuvConversionError> { - validate_dimensions(width, height)?; + let (effective_width, effective_height, _downscaled) = + validate_dimensions(width, height, self.gpu_max_texture_size)?; + self.ensure_texture_size(device, effective_width, effective_height); self.swap_output_buffer(); let mut rgba_data = vec![0u8; (width * height * 4) as usize]; diff --git a/crates/video-decode/Cargo.toml b/crates/video-decode/Cargo.toml index c4396ac6cb..ef3c82a9de 100644 --- a/crates/video-decode/Cargo.toml +++ b/crates/video-decode/Cargo.toml @@ -9,6 +9,7 @@ workspace = true [dependencies] ffmpeg.workspace = true ffmpeg-hw-device = { path = "../ffmpeg-hw-device" } +num_cpus = "1.16" tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tracing = "0.1.41" workspace-hack = { version = "0.1", path = "../workspace-hack" } diff --git a/crates/video-decode/src/ffmpeg.rs b/crates/video-decode/src/ffmpeg.rs index 3e981df50d..354c71e935 100644 --- a/crates/video-decode/src/ffmpeg.rs +++ b/crates/video-decode/src/ffmpeg.rs @@ -7,8 +7,169 @@ use ffmpeg::{ }; use ffmpeg_hw_device::{CodecContextExt, HwDevice}; use std::path::PathBuf; +use std::sync::OnceLock; use tracing::*; +#[derive(Debug, Clone)] +pub struct HwDecoderCapabilities { + pub max_width: u32, + pub max_height: u32, + pub supports_hw_decode: bool, +} + +impl Default for HwDecoderCapabilities { + fn default() -> Self { + Self { + max_width: 8192, + max_height: 8192, + supports_hw_decode: true, + } + } +} + +static HW_CAPABILITIES: OnceLock = OnceLock::new(); + +#[cfg(target_os = "windows")] +fn query_d3d11_video_decoder_capabilities() -> HwDecoderCapabilities { + use windows::{ + Win32::{ + Foundation::HMODULE, + Graphics::{ + Direct3D::D3D_DRIVER_TYPE_HARDWARE, + Direct3D11::{ + D3D11_CREATE_DEVICE_VIDEO_SUPPORT, D3D11_DECODER_PROFILE_H264_VLD_NOFGT, + D3D11_DECODER_PROFILE_HEVC_VLD_MAIN, D3D11_SDK_VERSION, + D3D11_VIDEO_DECODER_DESC, D3D11CreateDevice, ID3D11VideoDevice, + }, + Dxgi::Common::DXGI_FORMAT_NV12, + }, + }, + core::Interface, + }; + + let result: Result = (|| { + let mut device = None; + unsafe { + D3D11CreateDevice( + None, + D3D_DRIVER_TYPE_HARDWARE, + HMODULE::default(), + D3D11_CREATE_DEVICE_VIDEO_SUPPORT, + None, + D3D11_SDK_VERSION, + Some(&mut device), + None, + None, + ) + .map_err(|e| format!("D3D11CreateDevice failed: {e:?}"))?; + } + + let device = device.ok_or("D3D11CreateDevice returned null")?; + + let video_device: ID3D11VideoDevice = device + .cast() + .map_err(|e| format!("Failed to get ID3D11VideoDevice: {e:?}"))?; + + let profiles = [ + D3D11_DECODER_PROFILE_H264_VLD_NOFGT, + D3D11_DECODER_PROFILE_HEVC_VLD_MAIN, + ]; + + let mut max_width = 4096u32; + let mut max_height = 4096u32; + let mut supports_hw = false; + + for profile in &profiles { + let desc = D3D11_VIDEO_DECODER_DESC { + Guid: *profile, + SampleWidth: 8192, + SampleHeight: 8192, + OutputFormat: DXGI_FORMAT_NV12, + }; + + if let Ok(config_count) = unsafe { video_device.GetVideoDecoderConfigCount(&desc) } { + if config_count > 0 { + supports_hw = true; + max_width = max_width.max(8192); + max_height = max_height.max(8192); + } + } else { + let desc_4k = D3D11_VIDEO_DECODER_DESC { + Guid: *profile, + SampleWidth: 4096, + SampleHeight: 4096, + OutputFormat: DXGI_FORMAT_NV12, + }; + + if let Ok(config_count) = + unsafe { video_device.GetVideoDecoderConfigCount(&desc_4k) } + && config_count > 0 + { + supports_hw = true; + } + } + } + + Ok(HwDecoderCapabilities { + max_width, + max_height, + supports_hw_decode: supports_hw, + }) + })(); + + match result { + Ok(caps) => { + info!( + "D3D11 video decoder capabilities: {}x{}, hw_decode={}", + caps.max_width, caps.max_height, caps.supports_hw_decode + ); + caps + } + Err(e) => { + warn!("Failed to query D3D11 video decoder capabilities: {e}, using defaults"); + HwDecoderCapabilities::default() + } + } +} + +#[cfg(not(target_os = "windows"))] +fn query_d3d11_video_decoder_capabilities() -> HwDecoderCapabilities { + HwDecoderCapabilities::default() +} + +pub fn get_hw_decoder_capabilities() -> &'static HwDecoderCapabilities { + HW_CAPABILITIES.get_or_init(query_d3d11_video_decoder_capabilities) +} + +fn configure_software_threading(decoder: &mut avcodec::decoder::Video, width: u32, height: u32) { + let pixel_count = (width as u64) * (height as u64); + + let thread_count = if pixel_count > 8294400 { + 0 + } else if pixel_count > 2073600 { + (num_cpus::get() / 2).max(2) as i32 + } else { + 2 + }; + + unsafe { + let codec_ctx = decoder.as_mut_ptr(); + if !codec_ctx.is_null() { + (*codec_ctx).thread_count = thread_count; + (*codec_ctx).thread_type = ffmpeg::sys::FF_THREAD_FRAME; + } + } + + info!( + "Software decode configured: {width}x{height}, thread_count={}, thread_type=frame", + if thread_count == 0 { + "auto".to_string() + } else { + thread_count.to_string() + } + ); +} + pub struct FFmpegDecoder { input: avformat::context::Input, decoder: avcodec::decoder::Video, @@ -48,28 +209,41 @@ impl FFmpegDecoder { let width = decoder.width(); let height = decoder.height(); - let exceeds_common_hw_limits = width > 4096 || height > 4096; + let hw_caps = get_hw_decoder_capabilities(); + let exceeds_hw_limits = width > hw_caps.max_width + || height > hw_caps.max_height + || !hw_caps.supports_hw_decode; let hw_device = hw_device_type.and_then(|hw_device_type| { - if exceeds_common_hw_limits { + if exceeds_hw_limits { warn!( - "Video dimensions {width}x{height} exceed common hardware decoder limits (4096x4096), not using hardware acceleration" + "Video dimensions {width}x{height} exceed hardware decoder limits ({}x{}), using software decode", + hw_caps.max_width, hw_caps.max_height ); + configure_software_threading(&mut decoder, width, height); None } else { match decoder.try_use_hw_device(hw_device_type) { Ok(device) => { - debug!("Using hardware device"); + info!( + "Using hardware acceleration for {width}x{height} video (device: {:?})", + hw_device_type + ); Some(device) - }, + } Err(error) => { - error!("Failed to enable hardware decoder: {error:?}"); + warn!("Failed to enable hardware decoder: {error:?}, falling back to optimized software decode"); + configure_software_threading(&mut decoder, width, height); None } } } }); + if hw_device.is_none() && hw_device_type.is_none() { + configure_software_threading(&mut decoder, width, height); + } + Ok(FFmpegDecoder { input, decoder, diff --git a/crates/video-decode/src/lib.rs b/crates/video-decode/src/lib.rs index 6cd4e82171..ae408bc902 100644 --- a/crates/video-decode/src/lib.rs +++ b/crates/video-decode/src/lib.rs @@ -8,4 +8,7 @@ pub mod media_foundation; pub use avassetreader::AVAssetReaderDecoder; pub use ffmpeg::FFmpegDecoder; #[cfg(target_os = "windows")] -pub use media_foundation::{MFDecodedFrame, MediaFoundationDecoder, NV12Data}; +pub use media_foundation::{ + MFDecodedFrame, MFDecoderCapabilities, MediaFoundationDecoder, NV12Data, + get_mf_decoder_capabilities, +}; diff --git a/crates/video-decode/src/media_foundation.rs b/crates/video-decode/src/media_foundation.rs index 2b635071d2..a2aa9e2c34 100644 --- a/crates/video-decode/src/media_foundation.rs +++ b/crates/video-decode/src/media_foundation.rs @@ -1,16 +1,19 @@ use std::path::Path; -use tracing::info; +use std::sync::OnceLock; +use tracing::{info, warn}; use windows::{ Win32::{ Foundation::{HANDLE, HMODULE}, Graphics::{ - Direct3D::D3D_DRIVER_TYPE_HARDWARE, + Direct3D::{D3D_DRIVER_TYPE_HARDWARE, D3D_FEATURE_LEVEL}, Direct3D11::{ D3D11_BIND_SHADER_RESOURCE, D3D11_CPU_ACCESS_READ, D3D11_CREATE_DEVICE_BGRA_SUPPORT, D3D11_CREATE_DEVICE_VIDEO_SUPPORT, + D3D11_DECODER_PROFILE_H264_VLD_NOFGT, D3D11_DECODER_PROFILE_HEVC_VLD_MAIN, D3D11_MAP_READ, D3D11_MAPPED_SUBRESOURCE, D3D11_SDK_VERSION, D3D11_TEXTURE2D_DESC, - D3D11_USAGE_DEFAULT, D3D11_USAGE_STAGING, D3D11CreateDevice, ID3D11Device, - ID3D11DeviceContext, ID3D11Texture2D, + D3D11_USAGE_DEFAULT, D3D11_USAGE_STAGING, D3D11_VIDEO_DECODER_DESC, + D3D11CreateDevice, ID3D11Device, ID3D11DeviceContext, ID3D11Texture2D, + ID3D11VideoDevice, }, Dxgi::Common::{DXGI_FORMAT_NV12, DXGI_SAMPLE_DESC}, }, @@ -28,6 +31,112 @@ use windows::{ core::{Interface, PCWSTR}, }; +#[derive(Debug, Clone)] +pub struct MFDecoderCapabilities { + pub max_width: u32, + pub max_height: u32, + pub supports_h264: bool, + pub supports_hevc: bool, + pub feature_level: D3D_FEATURE_LEVEL, +} + +impl Default for MFDecoderCapabilities { + fn default() -> Self { + Self { + max_width: 4096, + max_height: 4096, + supports_h264: true, + supports_hevc: false, + feature_level: windows::Win32::Graphics::Direct3D::D3D_FEATURE_LEVEL_11_0, + } + } +} + +static MF_CAPABILITIES: OnceLock = OnceLock::new(); + +fn query_mf_decoder_capabilities(device: &ID3D11Device) -> MFDecoderCapabilities { + let result: Result = (|| { + let video_device: ID3D11VideoDevice = device + .cast() + .map_err(|e| format!("Failed to get ID3D11VideoDevice: {e:?}"))?; + + let feature_level = unsafe { device.GetFeatureLevel() }; + + let mut max_width = 4096u32; + let mut max_height = 4096u32; + let mut supports_h264 = false; + let mut supports_hevc = false; + + let test_resolutions = [(8192, 8192), (7680, 4320), (5120, 2880), (4096, 4096)]; + + for &(test_w, test_h) in &test_resolutions { + let h264_desc = D3D11_VIDEO_DECODER_DESC { + Guid: D3D11_DECODER_PROFILE_H264_VLD_NOFGT, + SampleWidth: test_w, + SampleHeight: test_h, + OutputFormat: DXGI_FORMAT_NV12, + }; + + if let Ok(config_count) = unsafe { video_device.GetVideoDecoderConfigCount(&h264_desc) } + && config_count > 0 + { + supports_h264 = true; + max_width = max_width.max(test_w); + max_height = max_height.max(test_h); + break; + } + } + + for &(test_w, test_h) in &test_resolutions { + let hevc_desc = D3D11_VIDEO_DECODER_DESC { + Guid: D3D11_DECODER_PROFILE_HEVC_VLD_MAIN, + SampleWidth: test_w, + SampleHeight: test_h, + OutputFormat: DXGI_FORMAT_NV12, + }; + + if let Ok(config_count) = unsafe { video_device.GetVideoDecoderConfigCount(&hevc_desc) } + && config_count > 0 + { + supports_hevc = true; + max_width = max_width.max(test_w); + max_height = max_height.max(test_h); + break; + } + } + + Ok(MFDecoderCapabilities { + max_width, + max_height, + supports_h264, + supports_hevc, + feature_level, + }) + })(); + + match result { + Ok(caps) => { + info!( + max_width = caps.max_width, + max_height = caps.max_height, + supports_h264 = caps.supports_h264, + supports_hevc = caps.supports_hevc, + feature_level = ?caps.feature_level, + "MediaFoundation decoder capabilities detected" + ); + caps + } + Err(e) => { + warn!("Failed to query MediaFoundation decoder capabilities: {e}, using defaults"); + MFDecoderCapabilities::default() + } + } +} + +pub fn get_mf_decoder_capabilities() -> Option<&'static MFDecoderCapabilities> { + MF_CAPABILITIES.get() +} + pub struct MFDecodedFrame { pub texture: ID3D11Texture2D, pub shared_handle: Option, @@ -46,6 +155,142 @@ pub struct NV12Data { pub uv_stride: u32, } +struct TexturePool { + output_texture: Option, + y_texture: Option, + uv_texture: Option, + width: u32, + height: u32, +} + +impl TexturePool { + fn new() -> Self { + Self { + output_texture: None, + y_texture: None, + uv_texture: None, + width: 0, + height: 0, + } + } + + fn get_or_create_output_texture( + &mut self, + device: &ID3D11Device, + width: u32, + height: u32, + ) -> Result<&ID3D11Texture2D, String> { + if self.width != width || self.height != height || self.output_texture.is_none() { + let desc = D3D11_TEXTURE2D_DESC { + Width: width, + Height: height, + MipLevels: 1, + ArraySize: 1, + Format: DXGI_FORMAT_NV12, + SampleDesc: DXGI_SAMPLE_DESC { + Count: 1, + Quality: 0, + }, + Usage: D3D11_USAGE_DEFAULT, + BindFlags: D3D11_BIND_SHADER_RESOURCE.0 as u32, + CPUAccessFlags: 0, + MiscFlags: 0, + }; + + let texture = unsafe { + let mut tex: Option = None; + device + .CreateTexture2D(&desc, None, Some(&mut tex)) + .map_err(|e| format!("CreateTexture2D failed: {e:?}"))?; + tex.ok_or("CreateTexture2D returned null")? + }; + + self.output_texture = Some(texture); + self.width = width; + self.height = height; + self.y_texture = None; + self.uv_texture = None; + } + + self.output_texture + .as_ref() + .ok_or_else(|| "Output texture not initialized".to_string()) + } + + fn get_or_create_yuv_textures( + &mut self, + device: &ID3D11Device, + width: u32, + height: u32, + ) -> Result<(&ID3D11Texture2D, &ID3D11Texture2D), String> { + use windows::Win32::Graphics::Dxgi::Common::{ + DXGI_FORMAT_R8_UNORM, DXGI_FORMAT_R8G8_UNORM, + }; + + if self.width != width || self.height != height || self.y_texture.is_none() { + let y_desc = D3D11_TEXTURE2D_DESC { + Width: width, + Height: height, + MipLevels: 1, + ArraySize: 1, + Format: DXGI_FORMAT_R8_UNORM, + SampleDesc: DXGI_SAMPLE_DESC { + Count: 1, + Quality: 0, + }, + Usage: D3D11_USAGE_DEFAULT, + BindFlags: D3D11_BIND_SHADER_RESOURCE.0 as u32, + CPUAccessFlags: 0, + MiscFlags: 0, + }; + + let y_texture = unsafe { + let mut tex: Option = None; + device + .CreateTexture2D(&y_desc, None, Some(&mut tex)) + .map_err(|e| format!("CreateTexture2D Y failed: {e:?}"))?; + tex.ok_or("CreateTexture2D Y returned null")? + }; + + let uv_desc = D3D11_TEXTURE2D_DESC { + Width: width / 2, + Height: height / 2, + MipLevels: 1, + ArraySize: 1, + Format: DXGI_FORMAT_R8G8_UNORM, + SampleDesc: DXGI_SAMPLE_DESC { + Count: 1, + Quality: 0, + }, + Usage: D3D11_USAGE_DEFAULT, + BindFlags: D3D11_BIND_SHADER_RESOURCE.0 as u32, + CPUAccessFlags: 0, + MiscFlags: 0, + }; + + let uv_texture = unsafe { + let mut tex: Option = None; + device + .CreateTexture2D(&uv_desc, None, Some(&mut tex)) + .map_err(|e| format!("CreateTexture2D UV failed: {e:?}"))?; + tex.ok_or("CreateTexture2D UV returned null")? + }; + + self.y_texture = Some(y_texture); + self.uv_texture = Some(uv_texture); + self.width = width; + self.height = height; + } + + Ok(( + self.y_texture.as_ref().ok_or("Y texture not initialized")?, + self.uv_texture + .as_ref() + .ok_or("UV texture not initialized")?, + )) + } +} + pub struct MediaFoundationDecoder { source_reader: IMFSourceReader, d3d11_device: ID3D11Device, @@ -58,6 +303,8 @@ pub struct MediaFoundationDecoder { staging_texture: Option, staging_width: u32, staging_height: u32, + texture_pool: TexturePool, + capabilities: MFDecoderCapabilities, } struct MFInitGuard; @@ -98,9 +345,26 @@ impl MediaFoundationDecoder { let (width, height, frame_rate_num, frame_rate_den) = unsafe { get_video_info(&source_reader)? }; + let capabilities = MF_CAPABILITIES + .get_or_init(|| query_mf_decoder_capabilities(&d3d11_device)) + .clone(); + + if width > capabilities.max_width || height > capabilities.max_height { + warn!( + video_width = width, + video_height = height, + max_width = capabilities.max_width, + max_height = capabilities.max_height, + "Video dimensions exceed detected hardware decoder limits" + ); + } + info!( - "MediaFoundation decoder initialized: {}x{} @ {}/{}fps", - width, height, frame_rate_num, frame_rate_den + width = width, + height = height, + frame_rate = format!("{}/{}", frame_rate_num, frame_rate_den), + max_hw_resolution = format!("{}x{}", capabilities.max_width, capabilities.max_height), + "MediaFoundation decoder initialized" ); std::mem::forget(guard); @@ -117,6 +381,8 @@ impl MediaFoundationDecoder { staging_texture: None, staging_width: 0, staging_height: 0, + texture_pool: TexturePool::new(), + capabilities, }) } @@ -136,6 +402,10 @@ impl MediaFoundationDecoder { &self.d3d11_device } + pub fn capabilities(&self) -> &MFDecoderCapabilities { + &self.capabilities + } + pub fn read_texture_to_cpu( &mut self, texture: &ID3D11Texture2D, @@ -280,38 +550,76 @@ impl MediaFoundationDecoder { .map_err(|e| format!("GetSubresourceIndex failed: {e:?}"))? }; - let (output_texture, shared_handle) = unsafe { - copy_texture_subresource( - &self.d3d11_device, - &self.d3d11_context, + let output_texture = self + .texture_pool + .get_or_create_output_texture(&self.d3d11_device, self.width, self.height)? + .clone(); + + unsafe { + self.d3d11_context.CopySubresourceRegion( + &output_texture, + 0, + 0, + 0, + 0, &texture, subresource_index, - self.width, - self.height, - )? - }; + None, + ); + } - let yuv_planes = unsafe { - create_yuv_plane_textures( + let shared_handle = None; + + let (y_texture, y_handle, uv_texture, uv_handle) = { + let (y_tex, uv_tex) = self.texture_pool.get_or_create_yuv_textures( &self.d3d11_device, - &self.d3d11_context, - &output_texture, self.width, self.height, - ) - .ok() - }; + )?; - let (y_texture, y_handle, uv_texture, uv_handle) = yuv_planes - .map(|p| { - ( - Some(p.y_texture), - p.y_handle, - Some(p.uv_texture), - p.uv_handle, - ) - }) - .unwrap_or((None, None, None, None)); + let y_texture = y_tex.clone(); + let uv_texture = uv_tex.clone(); + + unsafe { + self.d3d11_context.CopySubresourceRegion( + &y_texture, + 0, + 0, + 0, + 0, + &output_texture, + 0, + Some(&windows::Win32::Graphics::Direct3D11::D3D11_BOX { + left: 0, + top: 0, + front: 0, + right: self.width, + bottom: self.height, + back: 1, + }), + ); + + self.d3d11_context.CopySubresourceRegion( + &uv_texture, + 0, + 0, + 0, + 0, + &output_texture, + 1, + Some(&windows::Win32::Graphics::Direct3D11::D3D11_BOX { + left: 0, + top: 0, + front: 0, + right: self.width / 2, + bottom: self.height / 2, + back: 1, + }), + ); + } + + (Some(y_texture), None, Some(uv_texture), None) + }; Ok(Some(MFDecodedFrame { texture: output_texture, @@ -513,154 +821,4 @@ unsafe fn get_video_info(source_reader: &IMFSourceReader) -> Result<(u32, u32, u Ok((width, height, frame_rate_num, frame_rate_den.max(1))) } -struct YuvPlaneTextures { - y_texture: ID3D11Texture2D, - y_handle: Option, - uv_texture: ID3D11Texture2D, - uv_handle: Option, -} - -unsafe fn create_yuv_plane_textures( - device: &ID3D11Device, - context: &ID3D11DeviceContext, - nv12_texture: &ID3D11Texture2D, - width: u32, - height: u32, -) -> Result { - use windows::Win32::Graphics::Dxgi::Common::{DXGI_FORMAT_R8_UNORM, DXGI_FORMAT_R8G8_UNORM}; - - let y_desc = D3D11_TEXTURE2D_DESC { - Width: width, - Height: height, - MipLevels: 1, - ArraySize: 1, - Format: DXGI_FORMAT_R8_UNORM, - SampleDesc: DXGI_SAMPLE_DESC { - Count: 1, - Quality: 0, - }, - Usage: D3D11_USAGE_DEFAULT, - BindFlags: D3D11_BIND_SHADER_RESOURCE.0 as u32, - CPUAccessFlags: 0, - MiscFlags: 0, - }; - - let mut y_texture: Option = None; - unsafe { - device - .CreateTexture2D(&y_desc, None, Some(&mut y_texture)) - .map_err(|e| format!("CreateTexture2D Y failed: {e:?}"))?; - } - let y_texture = y_texture.ok_or("CreateTexture2D Y returned null")?; - - let uv_desc = D3D11_TEXTURE2D_DESC { - Width: width / 2, - Height: height / 2, - MipLevels: 1, - ArraySize: 1, - Format: DXGI_FORMAT_R8G8_UNORM, - SampleDesc: DXGI_SAMPLE_DESC { - Count: 1, - Quality: 0, - }, - Usage: D3D11_USAGE_DEFAULT, - BindFlags: D3D11_BIND_SHADER_RESOURCE.0 as u32, - CPUAccessFlags: 0, - MiscFlags: 0, - }; - - let mut uv_texture: Option = None; - unsafe { - device - .CreateTexture2D(&uv_desc, None, Some(&mut uv_texture)) - .map_err(|e| format!("CreateTexture2D UV failed: {e:?}"))?; - } - let uv_texture = uv_texture.ok_or("CreateTexture2D UV returned null")?; - - unsafe { - context.CopySubresourceRegion( - &y_texture, - 0, - 0, - 0, - 0, - nv12_texture, - 0, - Some(&windows::Win32::Graphics::Direct3D11::D3D11_BOX { - left: 0, - top: 0, - front: 0, - right: width, - bottom: height, - back: 1, - }), - ); - - context.CopySubresourceRegion( - &uv_texture, - 0, - 0, - 0, - 0, - nv12_texture, - 1, - Some(&windows::Win32::Graphics::Direct3D11::D3D11_BOX { - left: 0, - top: 0, - front: 0, - right: width / 2, - bottom: height / 2, - back: 1, - }), - ); - } - - Ok(YuvPlaneTextures { - y_texture, - y_handle: None, - uv_texture, - uv_handle: None, - }) -} - -unsafe fn copy_texture_subresource( - device: &ID3D11Device, - context: &ID3D11DeviceContext, - source: &ID3D11Texture2D, - subresource_index: u32, - width: u32, - height: u32, -) -> Result<(ID3D11Texture2D, Option), String> { - let desc = D3D11_TEXTURE2D_DESC { - Width: width, - Height: height, - MipLevels: 1, - ArraySize: 1, - Format: DXGI_FORMAT_NV12, - SampleDesc: DXGI_SAMPLE_DESC { - Count: 1, - Quality: 0, - }, - Usage: D3D11_USAGE_DEFAULT, - BindFlags: D3D11_BIND_SHADER_RESOURCE.0 as u32, - CPUAccessFlags: 0, - MiscFlags: 0, - }; - - let mut output_texture: Option = None; - unsafe { - device - .CreateTexture2D(&desc, None, Some(&mut output_texture)) - .map_err(|e| format!("CreateTexture2D failed: {e:?}"))?; - } - - let output_texture = output_texture.ok_or("CreateTexture2D returned null")?; - - unsafe { - context.CopySubresourceRegion(&output_texture, 0, 0, 0, 0, source, subresource_index, None); - } - - Ok((output_texture, None)) -} - unsafe impl Send for MediaFoundationDecoder {}