@@ -303,6 +303,33 @@ static FORCE_INLINE void mov_rdata_avx(__m256i* dst, const __m256i* src)
303303}
304304#endif
305305
306+ // Check if only a single 16-bytes block has changed
307+ // Returning its position, or -1 if that is not the situation
308+ static inline usz scan16_rdata (const decltype (spu_thread::rdata)& _lhs, const decltype(spu_thread::rdata)& _rhs)
309+ {
310+ const auto lhs = reinterpret_cast <const v128*>(_lhs);
311+ const auto rhs = reinterpret_cast <const v128*>(_rhs);
312+
313+ u32 mask = 0 ;
314+
315+ for (usz i = 0 ; i < 8 ; i += 4 )
316+ {
317+ const u32 a = (lhs[i + 0 ] != rhs[i + 0 ]) ? 1 : 0 ;
318+ const u32 b = (lhs[i + 1 ] != rhs[i + 1 ]) ? 1 : 0 ;
319+ const u32 c = (lhs[i + 2 ] != rhs[i + 2 ]) ? 1 : 0 ;
320+ const u32 d = (lhs[i + 3 ] != rhs[i + 3 ]) ? 1 : 0 ;
321+
322+ mask |= ((a << 0 ) + (b << 1 ) + (c << 2 ) + (c << 3 )) << i;
323+ }
324+
325+ if (mask && (mask & (mask - 1 )) == 0 )
326+ {
327+ return std::countr_zero (mask);
328+ }
329+
330+ return umax;
331+ }
332+
306333#ifdef _MSC_VER
307334__forceinline
308335#endif
@@ -3854,6 +3881,11 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
38543881 return false ;
38553882 }
38563883
3884+ static const auto cast_as = [](void * ptr, usz pos){ return reinterpret_cast <u128 *>(ptr) + pos; };
3885+ static const auto cast_as_const = [](const void * ptr, usz pos){ return reinterpret_cast <const u128 *>(ptr) + pos; };
3886+
3887+ const usz diff16_pos = scan16_rdata (to_write, rdata);
3888+
38573889 auto [_oldd, _ok] = res.fetch_op ([&](u64 & r)
38583890 {
38593891 if ((r & -128 ) != rtime || (r & 127 ))
@@ -3975,8 +4007,19 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
39754007
39764008 if (cmp_rdata (rdata, super_data))
39774009 {
3978- mov_rdata (super_data, to_write);
3979- return true ;
4010+ if (diff16_pos != umax)
4011+ {
4012+ // Do it with CMPXCHG16B if possible, this allows to improve accuracy whenever "RSX Accurate Reservations" is off
4013+ if (atomic_storage<u128 >::compare_exchange (*cast_as (super_data, diff16_pos), *cast_as (rdata, diff16_pos), *cast_as_const (to_write, diff16_pos)))
4014+ {
4015+ return true ;
4016+ }
4017+ }
4018+ else
4019+ {
4020+ mov_rdata (super_data, to_write);
4021+ return true ;
4022+ }
39804023 }
39814024
39824025 return false ;
0 commit comments