@@ -986,38 +986,15 @@ void xe_device_wmb(struct xe_device *xe)
 	xe_mmio_write32(xe_root_tile_mmio(xe), VF_CAP_REG, 0);
 }
 
-/**
- * xe_device_td_flush() - Flush transient L3 cache entries
- * @xe: The device
- *
- * Display engine has direct access to memory and is never coherent with L3/L4
- * caches (or CPU caches), however KMD is responsible for specifically flushing
- * transient L3 GPU cache entries prior to the flip sequence to ensure scanout
- * can happen from such a surface without seeing corruption.
- *
- * Display surfaces can be tagged as transient by mapping it using one of the
- * various L3:XD PAT index modes on Xe2.
- *
- * Note: On non-discrete xe2 platforms, like LNL, the entire L3 cache is flushed
- * at the end of each submission via PIPE_CONTROL for compute/render, since SA
- * Media is not coherent with L3 and we want to support render-vs-media
- * usescases. For other engines like copy/blt the HW internally forces uncached
- * behaviour, hence why we can skip the TDF on such platforms.
+/*
+ * Issue a TRANSIENT_FLUSH_REQUEST and wait for completion on each gt.
  */
-void xe_device_td_flush(struct xe_device *xe)
+static void tdf_request_sync(struct xe_device *xe)
 {
-	struct xe_gt *gt;
 	unsigned int fw_ref;
+	struct xe_gt *gt;
 	u8 id;
 
-	if (!IS_DGFX(xe) || GRAPHICS_VER(xe) < 20)
-		return;
-
-	if (XE_WA(xe_root_mmio_gt(xe), 16023588340)) {
-		xe_device_l2_flush(xe);
-		return;
-	}
-
 	for_each_gt(gt, xe, id) {
 		if (xe_gt_is_media_type(gt))
 			continue;
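
The hunks that follow complete the split: the platform gating and the Wa_16023588340 special case deleted above reappear in a thin xe_device_td_flush() wrapper (shown in full in the last hunk of this diff), while the new static helper keeps only the per-gt register handling. Condensed from that hunk, the wrapper ends up as:

	void xe_device_td_flush(struct xe_device *xe)
	{
		struct xe_gt *root_gt;

		if (!IS_DGFX(xe) || GRAPHICS_VER(xe) < 20)
			return;

		root_gt = xe_root_mmio_gt(xe);
		if (XE_WA(root_gt, 16023588340))
			/* A transient flush is not sufficient: flush the L2 */
			xe_device_l2_flush(xe);
		else
			tdf_request_sync(xe);
	}
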
@@ -1027,6 +1004,7 @@ void xe_device_td_flush(struct xe_device *xe)
 			return;
 
 		xe_mmio_write32(&gt->mmio, XE2_TDF_CTRL, TRANSIENT_FLUSH_REQUEST);
+
 		/*
 		 * FIXME: We can likely do better here with our choice of
 		 * timeout. Currently we just assume the worst case, i.e. 150us,
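
The hunk context ends mid-comment here, and the forcewake acquisition between the two hunks above is outside the visible context. Putting the visible pieces together, a sketch of the full helper follows; the xe_force_wake_get()/XE_FW_GT call, the completion wait on XE2_TDF_CTRL with the 150us worst case the FIXME mentions, and the error message are assumptions inferred from the context lines (the bare return above, and the xe_mmio_wait32() pattern used by xe_device_l2_flush() below), not lines shown in this diff:

	/* Sketch of the assembled helper; forcewake handling and the
	 * completion wait are inferred from context, not shown verbatim
	 * in this diff. */
	static void tdf_request_sync(struct xe_device *xe)
	{
		unsigned int fw_ref;
		struct xe_gt *gt;
		u8 id;

		for_each_gt(gt, xe, id) {
			/* The transient flush only applies to primary GTs */
			if (xe_gt_is_media_type(gt))
				continue;

			/* Assumed: take a GT forcewake ref before MMIO */
			fw_ref = xe_force_wake_get(gt_to_fw(gt), XE_FW_GT);
			if (!fw_ref)
				return;

			xe_mmio_write32(&gt->mmio, XE2_TDF_CTRL,
					TRANSIENT_FLUSH_REQUEST);

			/* Assumed: poll until HW clears the request bit,
			 * 150us worst case per the FIXME comment above */
			if (xe_mmio_wait32(&gt->mmio, XE2_TDF_CTRL,
					   TRANSIENT_FLUSH_REQUEST, 0,
					   150, NULL, false))
				xe_gt_err_once(gt, "TD flush timeout\n");

			xe_force_wake_put(gt_to_fw(gt), fw_ref);
		}
	}
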
@@ -1057,15 +1035,49 @@ void xe_device_l2_flush(struct xe_device *xe)
 		return;
 
 	spin_lock(&gt->global_invl_lock);
-	xe_mmio_write32(&gt->mmio, XE2_GLOBAL_INVAL, 0x1);
 
+	xe_mmio_write32(&gt->mmio, XE2_GLOBAL_INVAL, 0x1);
 	if (xe_mmio_wait32(&gt->mmio, XE2_GLOBAL_INVAL, 0x1, 0x0, 500, NULL, true))
 		xe_gt_err_once(gt, "Global invalidation timeout\n");
+
 	spin_unlock(&gt->global_invl_lock);
 
 	xe_force_wake_put(gt_to_fw(gt), fw_ref);
 }
 
+/**
+ * xe_device_td_flush() - Flush transient L3 cache entries
+ * @xe: The device
+ *
+ * Display engine has direct access to memory and is never coherent with L3/L4
+ * caches (or CPU caches); however, KMD is responsible for specifically flushing
+ * transient L3 GPU cache entries prior to the flip sequence to ensure scanout
+ * can happen from such a surface without seeing corruption.
+ *
+ * Display surfaces can be tagged as transient by mapping them using one of the
+ * various L3:XD PAT index modes on Xe2.
+ *
+ * Note: On non-discrete xe2 platforms, like LNL, the entire L3 cache is flushed
+ * at the end of each submission via PIPE_CONTROL for compute/render, since SA
+ * Media is not coherent with L3 and we want to support render-vs-media
+ * use cases. For other engines like copy/blt the HW internally forces uncached
+ * behaviour, hence why we can skip the TDF on such platforms.
+ */
+void xe_device_td_flush(struct xe_device *xe)
+{
+	struct xe_gt *root_gt;
+
+	if (!IS_DGFX(xe) || GRAPHICS_VER(xe) < 20)
+		return;
+
+	root_gt = xe_root_mmio_gt(xe);
+	if (XE_WA(root_gt, 16023588340))
+		/* A transient flush is not sufficient: flush the L2 */
+		xe_device_l2_flush(xe);
+	else
+		tdf_request_sync(xe);
+}
+
 u32 xe_device_ccs_bytes(struct xe_device *xe, u64 size)
 {
 	return xe_device_has_flat_ccs(xe) ?
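
For callers the entry point is unchanged: display code still calls xe_device_td_flush() ahead of the flip sequence when the surface was mapped with a transient L3:XD PAT index, and the choice between the TDF request and the Wa_16023588340 L2 flush is now internal to the function. A minimal, hypothetical call site (prepare_scanout() is illustrative only, not part of this diff or the driver):

	/* Hypothetical caller: flush transient L3 entries before scanout,
	 * which bypasses L3/L4 and would otherwise see stale data. */
	static void prepare_scanout(struct xe_device *xe)
	{
		xe_device_td_flush(xe);

		/* ... continue with the flip sequence ... */
	}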