Skip to content

Commit c5fa2f0

Browse files
committed
[D3D12] Fix cross-format EDRAM data written to gamma_unorm16 render targets
The color_packed_in_r0x_and_r1x output path used UBFE to write uint values to a float-typed R16G16B16A16_UNORM output, type-punning them as denormalized floats (~0). Affected transfers (k_2_10_10_10_FLOAT → k_8_8_8_8_GAMMA in Halo Reach) cause black textures on rocks and plants. To work around this, we extract the 4 gamma bytes from the packed EDRAM dword and convert them to linear floats using midpoint encoding — centering values in the valid UNORM16 range so they survive the round-trip through PreSaturatedLinearToPWLGamma's trunc() in the dump shader.
1 parent f7b9a87 commit c5fa2f0

File tree

1 file changed

+69
-0
lines changed

1 file changed

+69
-0
lines changed

src/xenia/gpu/d3d12/d3d12_render_target_cache.cc

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3482,6 +3482,75 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) {
34823482
if (dest_color_format == xenos::ColorRenderTargetFormat::k_32_32_FLOAT) {
34833483
a.OpMov(dxbc::Dest::O(0, 0b0001), dxbc::Src::R(0, dxbc::Src::kXXXX));
34843484
a.OpMov(dxbc::Dest::O(0, 0b0010), dxbc::Src::R(1, dxbc::Src::kXXXX));
3485+
} else if (dest_is_gamma_unorm16) {
3486+
// For gamma_unorm16, only r1.x has a valid packed 32-bit EDRAM
3487+
// dword (one pixel, since gamma_unorm16 is 32bpp Xenos / 64bpp
3488+
// host). Reinterpret as k_8_8_8_8_GAMMA (4 gamma-encoded bytes)
3489+
// and convert to linear float for R16G16B16A16_UNORM output.
3490+
//
3491+
// Use midpoint encoding to survive the UNORM16 quantization
3492+
// round-trip through PreSaturatedLinearToPWLGamma (which uses
3493+
// trunc()). Storing the exact PWLGammaToLinear result puts values
3494+
// at the lower boundary of the valid range, where UNORM16
3495+
// quantization can push them below threshold causing +/-1 byte
3496+
// errors that corrupt cross-format EDRAM reinterpretation.
3497+
//
3498+
// For gamma byte B, the midpoint linear value is:
3499+
// Piece 0 (B < 64): F = (B + 0.5) / 1023.0
3500+
// Piece 1 (64<=B<96): F = (B - 31.5) / 511.5
3501+
// Piece 2 (96<=B<192): F = (B - 63.5) / 255.75
3502+
// Piece 3 (B >= 192): F = (B - 127.5) / 127.875
3503+
// Using MAd form: F = B * recip + offset.
3504+
3505+
// Extract 4 bytes: r1.xyzw = [R, G, B, A] as uint.
3506+
a.OpUBFE(dxbc::Dest::R(1), dxbc::Src::LU(8, 8, 8, 8),
3507+
dxbc::Src::LU(0, 8, 16, 24),
3508+
dxbc::Src::R(1, dxbc::Src::kXXXX));
3509+
3510+
// Alpha: o0.w = float(A) / 255.0 (no gamma conversion).
3511+
a.OpUToF(dxbc::Dest::R(0, 0b1000), dxbc::Src::R(1, dxbc::Src::kWWWW));
3512+
a.OpMul(dxbc::Dest::O(0, 0b1000), dxbc::Src::R(0, dxbc::Src::kWWWW),
3513+
dxbc::Src::LF(1.0f / 255.0f));
3514+
3515+
// RGB: per-channel midpoint encoding using r0.xy as (recip,
3516+
// offset) and r2.x as comparison temp.
3517+
for (uint32_t j = 0; j < 3; ++j) {
3518+
// Default to piece 0.
3519+
a.OpMov(dxbc::Dest::R(0, 0b0001), dxbc::Src::LF(1.0f / 1023.0f));
3520+
a.OpMov(dxbc::Dest::R(0, 0b0010), dxbc::Src::LF(0.5f / 1023.0f));
3521+
// Piece 1: byte >= 64.
3522+
a.OpUGE(dxbc::Dest::R(2, 0b0001), dxbc::Src::R(1).Select(j),
3523+
dxbc::Src::LU(64));
3524+
a.OpMovC(dxbc::Dest::R(0, 0b0001), dxbc::Src::R(2, dxbc::Src::kXXXX),
3525+
dxbc::Src::LF(1.0f / 511.5f),
3526+
dxbc::Src::R(0, dxbc::Src::kXXXX));
3527+
a.OpMovC(dxbc::Dest::R(0, 0b0010), dxbc::Src::R(2, dxbc::Src::kXXXX),
3528+
dxbc::Src::LF(-31.5f / 511.5f),
3529+
dxbc::Src::R(0, dxbc::Src::kYYYY));
3530+
// Piece 2: byte >= 96.
3531+
a.OpUGE(dxbc::Dest::R(2, 0b0001), dxbc::Src::R(1).Select(j),
3532+
dxbc::Src::LU(96));
3533+
a.OpMovC(dxbc::Dest::R(0, 0b0001), dxbc::Src::R(2, dxbc::Src::kXXXX),
3534+
dxbc::Src::LF(1.0f / 255.75f),
3535+
dxbc::Src::R(0, dxbc::Src::kXXXX));
3536+
a.OpMovC(dxbc::Dest::R(0, 0b0010), dxbc::Src::R(2, dxbc::Src::kXXXX),
3537+
dxbc::Src::LF(-63.5f / 255.75f),
3538+
dxbc::Src::R(0, dxbc::Src::kYYYY));
3539+
// Piece 3: byte >= 192.
3540+
a.OpUGE(dxbc::Dest::R(2, 0b0001), dxbc::Src::R(1).Select(j),
3541+
dxbc::Src::LU(192));
3542+
a.OpMovC(dxbc::Dest::R(0, 0b0001), dxbc::Src::R(2, dxbc::Src::kXXXX),
3543+
dxbc::Src::LF(1.0f / 127.875f),
3544+
dxbc::Src::R(0, dxbc::Src::kXXXX));
3545+
a.OpMovC(dxbc::Dest::R(0, 0b0010), dxbc::Src::R(2, dxbc::Src::kXXXX),
3546+
dxbc::Src::LF(-127.5f / 127.875f),
3547+
dxbc::Src::R(0, dxbc::Src::kYYYY));
3548+
// F = float(byte) * recip + offset.
3549+
a.OpUToF(dxbc::Dest::R(2, 0b0001), dxbc::Src::R(1).Select(j));
3550+
a.OpMAd(dxbc::Dest::O(0, 1 << j), dxbc::Src::R(2, dxbc::Src::kXXXX),
3551+
dxbc::Src::R(0, dxbc::Src::kXXXX),
3552+
dxbc::Src::R(0, dxbc::Src::kYYYY));
3553+
}
34853554
} else {
34863555
for (uint32_t i = 0; i < 2; ++i) {
34873556
a.OpUBFE(dxbc::Dest::O(0, 0b11 << (i * 2)), dxbc::Src::LU(16),

0 commit comments

Comments
 (0)