@@ -3482,6 +3482,75 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) {
34823482 if (dest_color_format == xenos::ColorRenderTargetFormat::k_32_32_FLOAT) {
34833483 a.OpMov (dxbc::Dest::O (0 , 0b0001 ), dxbc::Src::R (0 , dxbc::Src::kXXXX ));
34843484 a.OpMov (dxbc::Dest::O (0 , 0b0010 ), dxbc::Src::R (1 , dxbc::Src::kXXXX ));
3485+ } else if (dest_is_gamma_unorm16) {
3486+ // For gamma_unorm16, only r1.x has a valid packed 32-bit EDRAM
3487+ // dword (one pixel, since gamma_unorm16 is 32bpp Xenos / 64bpp
3488+ // host). Reinterpret as k_8_8_8_8_GAMMA (4 gamma-encoded bytes)
3489+ // and convert to linear float for R16G16B16A16_UNORM output.
3490+ //
3491+ // Use midpoint encoding to survive the UNORM16 quantization
3492+ // round-trip through PreSaturatedLinearToPWLGamma (which uses
3493+ // trunc()). Storing the exact PWLGammaToLinear result puts values
3494+ // at the lower boundary of the valid range, where UNORM16
3495+ // quantization can push them below threshold causing +/-1 byte
3496+ // errors that corrupt cross-format EDRAM reinterpretation.
3497+ //
3498+ // For gamma byte B, the midpoint linear value is:
3499+ // Piece 0 (B < 64): F = (B + 0.5) / 1023.0
3500+ // Piece 1 (64<=B<96): F = (B - 31.5) / 511.5
3501+ // Piece 2 (96<=B<192): F = (B - 63.5) / 255.75
3502+ // Piece 3 (B >= 192): F = (B - 127.5) / 127.875
3503+ // Using MAd form: F = B * recip + offset.
3504+
3505+ // Extract 4 bytes: r1.xyzw = [R, G, B, A] as uint.
3506+ a.OpUBFE (dxbc::Dest::R (1 ), dxbc::Src::LU (8 , 8 , 8 , 8 ),
3507+ dxbc::Src::LU (0 , 8 , 16 , 24 ),
3508+ dxbc::Src::R (1 , dxbc::Src::kXXXX ));
3509+
3510+ // Alpha: o0.w = float(A) / 255.0 (no gamma conversion).
3511+ a.OpUToF (dxbc::Dest::R (0 , 0b1000 ), dxbc::Src::R (1 , dxbc::Src::kWWWW ));
3512+ a.OpMul (dxbc::Dest::O (0 , 0b1000 ), dxbc::Src::R (0 , dxbc::Src::kWWWW ),
3513+ dxbc::Src::LF (1 .0f / 255 .0f ));
3514+
3515+ // RGB: per-channel midpoint encoding using r0.xy as (recip,
3516+ // offset); r2.x holds the compare mask, then float(byte).
3517+ for (uint32_t j = 0 ; j < 3 ; ++j) {
3518+ // Default to piece 0.
3519+ a.OpMov (dxbc::Dest::R (0 , 0b0001 ), dxbc::Src::LF (1 .0f / 1023 .0f ));
3520+ a.OpMov (dxbc::Dest::R (0 , 0b0010 ), dxbc::Src::LF (0 .5f / 1023 .0f ));
3521+ // Piece 1: byte >= 64.
3522+ a.OpUGE (dxbc::Dest::R (2 , 0b0001 ), dxbc::Src::R (1 ).Select (j),
3523+ dxbc::Src::LU (64 ));
3524+ a.OpMovC (dxbc::Dest::R (0 , 0b0001 ), dxbc::Src::R (2 , dxbc::Src::kXXXX ),
3525+ dxbc::Src::LF (1 .0f / 511 .5f ),
3526+ dxbc::Src::R (0 , dxbc::Src::kXXXX ));
3527+ a.OpMovC (dxbc::Dest::R (0 , 0b0010 ), dxbc::Src::R (2 , dxbc::Src::kXXXX ),
3528+ dxbc::Src::LF (-31 .5f / 511 .5f ),
3529+ dxbc::Src::R (0 , dxbc::Src::kYYYY ));
3530+ // Piece 2: byte >= 96.
3531+ a.OpUGE (dxbc::Dest::R (2 , 0b0001 ), dxbc::Src::R (1 ).Select (j),
3532+ dxbc::Src::LU (96 ));
3533+ a.OpMovC (dxbc::Dest::R (0 , 0b0001 ), dxbc::Src::R (2 , dxbc::Src::kXXXX ),
3534+ dxbc::Src::LF (1 .0f / 255 .75f ),
3535+ dxbc::Src::R (0 , dxbc::Src::kXXXX ));
3536+ a.OpMovC (dxbc::Dest::R (0 , 0b0010 ), dxbc::Src::R (2 , dxbc::Src::kXXXX ),
3537+ dxbc::Src::LF (-63 .5f / 255 .75f ),
3538+ dxbc::Src::R (0 , dxbc::Src::kYYYY ));
3539+ // Piece 3: byte >= 192.
3540+ a.OpUGE (dxbc::Dest::R (2 , 0b0001 ), dxbc::Src::R (1 ).Select (j),
3541+ dxbc::Src::LU (192 ));
3542+ a.OpMovC (dxbc::Dest::R (0 , 0b0001 ), dxbc::Src::R (2 , dxbc::Src::kXXXX ),
3543+ dxbc::Src::LF (1 .0f / 127 .875f ),
3544+ dxbc::Src::R (0 , dxbc::Src::kXXXX ));
3545+ a.OpMovC (dxbc::Dest::R (0 , 0b0010 ), dxbc::Src::R (2 , dxbc::Src::kXXXX ),
3546+ dxbc::Src::LF (-127 .5f / 127 .875f ),
3547+ dxbc::Src::R (0 , dxbc::Src::kYYYY ));
3548+ // F = float(byte) * recip + offset.
3549+ a.OpUToF (dxbc::Dest::R (2 , 0b0001 ), dxbc::Src::R (1 ).Select (j));
3550+ a.OpMAd (dxbc::Dest::O (0 , 1 << j), dxbc::Src::R (2 , dxbc::Src::kXXXX ),
3551+ dxbc::Src::R (0 , dxbc::Src::kXXXX ),
3552+ dxbc::Src::R (0 , dxbc::Src::kYYYY ));
3553+ }
34853554 } else {
34863555 for (uint32_t i = 0 ; i < 2 ; ++i) {
34873556 a.OpUBFE (dxbc::Dest::O (0 , 0b11 << (i * 2 )), dxbc::Src::LU (16 ),
0 commit comments