
Commit a1eec6c

drm/xe: Split xe_device_td_flush()
xe_device_td_flush() has two possible implementations: an entire L2 flush or a transient flush, depending on WA 16023588340. Make this clear by splitting the function so it calls each of them.

Reviewed-by: Matthew Auld <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Lucas De Marchi <[email protected]>
(cherry picked from commit 5e300ed)
Signed-off-by: Lucas De Marchi <[email protected]>
1 parent 4cec909 commit a1eec6c
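In short, the split turns xe_device_td_flush() into a thin dispatcher over the two implementations. A condensed sketch of the resulting control flow, restating the new code from the diff below (all identifiers come from the patch itself):

    void xe_device_td_flush(struct xe_device *xe)
    {
            if (!IS_DGFX(xe) || GRAPHICS_VER(xe) < 20)
                    return;

            if (XE_WA(xe_root_mmio_gt(xe), 16023588340))
                    /* A transient flush is not sufficient: flush the L2 */
                    xe_device_l2_flush(xe);
            else
                    tdf_request_sync(xe);
    }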


drivers/gpu/drm/xe/xe_device.c

Lines changed: 40 additions & 28 deletions
@@ -986,38 +986,15 @@ void xe_device_wmb(struct xe_device *xe)
 	xe_mmio_write32(xe_root_tile_mmio(xe), VF_CAP_REG, 0);
 }
 
-/**
- * xe_device_td_flush() - Flush transient L3 cache entries
- * @xe: The device
- *
- * Display engine has direct access to memory and is never coherent with L3/L4
- * caches (or CPU caches), however KMD is responsible for specifically flushing
- * transient L3 GPU cache entries prior to the flip sequence to ensure scanout
- * can happen from such a surface without seeing corruption.
- *
- * Display surfaces can be tagged as transient by mapping them using one of the
- * various L3:XD PAT index modes on Xe2.
- *
- * Note: On non-discrete xe2 platforms, like LNL, the entire L3 cache is flushed
- * at the end of each submission via PIPE_CONTROL for compute/render, since SA
- * Media is not coherent with L3 and we want to support render-vs-media
- * usecases. For other engines like copy/blt the HW internally forces uncached
- * behaviour, hence why we can skip the TDF on such platforms.
+/*
+ * Issue a TRANSIENT_FLUSH_REQUEST and wait for completion on each gt.
  */
-void xe_device_td_flush(struct xe_device *xe)
+static void tdf_request_sync(struct xe_device *xe)
 {
-	struct xe_gt *gt;
 	unsigned int fw_ref;
+	struct xe_gt *gt;
 	u8 id;
 
-	if (!IS_DGFX(xe) || GRAPHICS_VER(xe) < 20)
-		return;
-
-	if (XE_WA(xe_root_mmio_gt(xe), 16023588340)) {
-		xe_device_l2_flush(xe);
-		return;
-	}
-
 	for_each_gt(gt, xe, id) {
 		if (xe_gt_is_media_type(gt))
 			continue;
@@ -1027,6 +1004,7 @@ void xe_device_td_flush(struct xe_device *xe)
 			return;
 
 		xe_mmio_write32(&gt->mmio, XE2_TDF_CTRL, TRANSIENT_FLUSH_REQUEST);
+
 		/*
 		 * FIXME: We can likely do better here with our choice of
 		 * timeout. Currently we just assume the worst case, i.e. 150us,
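The hunk's context ends inside the FIXME comment, so the wait itself is not shown. Presumably it polls XE2_TDF_CTRL until the hardware clears the request bit, mirroring the xe_mmio_wait32() pattern visible for XE2_GLOBAL_INVAL in the next hunk; a sketch under that assumption, using the worst-case 150us timeout the FIXME mentions:

    /* Assumed, not shown in this diff: wait for HW to ack the flush by
     * clearing TRANSIENT_FLUSH_REQUEST in XE2_TDF_CTRL, with a 150us
     * worst-case timeout. */
    if (xe_mmio_wait32(&gt->mmio, XE2_TDF_CTRL, TRANSIENT_FLUSH_REQUEST, 0,
                       150, NULL, false))
            xe_gt_err_once(gt, "TD flush timeout\n");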
@@ -1057,15 +1035,49 @@ void xe_device_l2_flush(struct xe_device *xe)
 		return;
 
 	spin_lock(&gt->global_invl_lock);
-	xe_mmio_write32(&gt->mmio, XE2_GLOBAL_INVAL, 0x1);
 
+	xe_mmio_write32(&gt->mmio, XE2_GLOBAL_INVAL, 0x1);
 	if (xe_mmio_wait32(&gt->mmio, XE2_GLOBAL_INVAL, 0x1, 0x0, 500, NULL, true))
 		xe_gt_err_once(gt, "Global invalidation timeout\n");
+
 	spin_unlock(&gt->global_invl_lock);
 
 	xe_force_wake_put(gt_to_fw(gt), fw_ref);
 }
 
+/**
+ * xe_device_td_flush() - Flush transient L3 cache entries
+ * @xe: The device
+ *
+ * Display engine has direct access to memory and is never coherent with L3/L4
+ * caches (or CPU caches), however KMD is responsible for specifically flushing
+ * transient L3 GPU cache entries prior to the flip sequence to ensure scanout
+ * can happen from such a surface without seeing corruption.
+ *
+ * Display surfaces can be tagged as transient by mapping them using one of the
+ * various L3:XD PAT index modes on Xe2.
+ *
+ * Note: On non-discrete xe2 platforms, like LNL, the entire L3 cache is flushed
+ * at the end of each submission via PIPE_CONTROL for compute/render, since SA
+ * Media is not coherent with L3 and we want to support render-vs-media
+ * usecases. For other engines like copy/blt the HW internally forces uncached
+ * behaviour, hence why we can skip the TDF on such platforms.
+ */
+void xe_device_td_flush(struct xe_device *xe)
+{
+	struct xe_gt *root_gt;
+
+	if (!IS_DGFX(xe) || GRAPHICS_VER(xe) < 20)
+		return;
+
+	root_gt = xe_root_mmio_gt(xe);
+	if (XE_WA(root_gt, 16023588340))
+		/* A transient flush is not sufficient: flush the L2 */
+		xe_device_l2_flush(xe);
+	else
+		tdf_request_sync(xe);
+}
+
 u32 xe_device_ccs_bytes(struct xe_device *xe, u64 size)
 {
 	return xe_device_has_flat_ccs(xe) ?
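Per the kerneldoc above, the KMD must issue this flush before a flip targeting a surface mapped with a transient (L3:XD) PAT index. An illustrative caller sketch, where the helper name and placement are hypothetical and only xe_device_td_flush() comes from the patch:

    /* Hypothetical helper: flush transient L3 entries before handing a
     * display surface to scanout, then continue with the flip sequence. */
    static void prepare_scanout_surface(struct xe_device *xe)
    {
            /* surface was mapped with an L3:XD (transient) PAT index */
            xe_device_td_flush(xe);
            /* ...proceed with the flip... */
    }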
