Skip to content

Commit cafffb4

Browse files
committed
SPU: SPURS oriented thread waiting
1 parent 16f619d commit cafffb4

File tree

3 files changed

+244
-4
lines changed

3 files changed

+244
-4
lines changed

rpcs3/Emu/Cell/SPUThread.cpp

Lines changed: 234 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -490,6 +490,11 @@ namespace vm
490490

491491
namespace spu
492492
{
493+
// Holds per-SPU atomic-operation tracking slots for raw SPUs, which have no
// thread group to carry this state (grouped SPUs use lv2_spu_group::atomic_ops).
// Registered in g_fxo; see get_spu_atomic_op_info().
struct raw_spu_atomic_info_t
{
	// One slot per raw SPU, indexed by spu_to_index()
	std::array<atomic_t<spu_atomic_op_info_for_group>, 8> raw_atomic_ops;
};
497+
493498
namespace scheduler
494499
{
495500
std::array<atomic_t<u8>, 65536> atomic_instruction_table = {};
@@ -4699,6 +4704,154 @@ u32 evaluate_spin_optimization(std::span<u8> stats, u64 evaluate_time, const cfg
46994704
return busy_waiting_switch;
47004705
}
47014706

4707+
// Compute this thread's slot index (0..7) in the atomic-op info table.
// Grouped SPU threads carry the index in the top byte of lv2_id; raw SPUs
// presumably use lv2_id itself as the index — confirm against id allocation.
inline u8 spu_to_index(const spu_thread* spu) noexcept
{
	if (spu->group)
	{
		return spu->lv2_id >> 24;
	}

	return spu->lv2_id;
}
4711+
4712+
// Select the atomic-op info table for this SPU thread: the owning group's
// table when one exists, otherwise the process-wide raw-SPU table in g_fxo.
inline std::array<atomic_t<spu_atomic_op_info_for_group>, 8>& get_spu_atomic_op_info(const spu_thread* spu) noexcept
{
	if (spu->group)
	{
		return spu->group->atomic_ops;
	}

	return g_fxo->get<spu::raw_spu_atomic_info_t>().raw_atomic_ops;
}
4716+
4717+
// To be used by GETLLAR
// Returns non-zero if the caller needs to wait (1: claim bit taken from a
// temporarily-locked entry, 2: another SPU holds a full lock on the line)
int test_and_update_atomic_op_info(std::array<atomic_t<spu_atomic_op_info_for_group>, 8>& spu_info, u8 index, u32 raddr, u32 getllar_pc)
{
	// Entry encoding (see init_atomic_op_info): upper bits of addr select the
	// 128-byte reservation line; the low 7 bits distinguish a full lock
	// (addr % 128 == 0) from a temporary lock (low bits set).
	auto info = spu_info[index].load();

	if (info.addr / 128 == raddr / 128 && info.getllar == getllar_pc)
	{
		if (info.addr % 128)
		{
			// Our own entry matches and is only temporarily locked:
			// upgrade it to a full lock (clear the low 7 bits)
			info.addr &= -128;
			spu_info[index].release(info);
			return 0;
		}

		// Repeated GETLLAR: disable entry
	}

	// Clear our own slot before scanning the others
	info = {};

	spu_info[index].release(info);

	// Scan all slots for another SPU working on the same line from the same
	// GETLLAR site (the classic SPURS contention pattern)
	for (usz i = 0; i < spu_info.size(); i++)
	{
		info = spu_info[i].load();

		if (info.addr / 128 == raddr / 128 && info.getllar == getllar_pc)
		{
			int wait = 0;

			// Re-check under the atomic RMW: the snapshot above may be stale
			spu_info[i].fetch_op([&](spu_atomic_op_info_for_group& value)
			{
				wait = 0;

				if (value.addr / 128 == raddr / 128 && value.getllar == getllar_pc)
				{
					if (value.addr % 128 == 0)
					{
						// Full lock held by another SPU: caller should yield
						wait = 2;
						return false;
					}

					// NOTE(review): index can reach 7, so (1u << index) may test
					// bit 7 of addr, which lies outside the 127 temporary-lock
					// mask — confirm this is intended for 8-thread groups
					if (value.addr & (1u << index))
					{
						// Consume our claim bit once; further calls won't wait on it
						value.addr &= ~(1u << index);
						wait = 1;
						return true;
					}
				}

				return false;
			});

			if (wait)
			{
				return wait;
			}
		}
	}

	return 0;
}
4779+
4780+
// To be used when PUTLLC finishes to create a temporary barrier until the SPURS loop restarts
4781+
void downgrade_to_temporary_atomic_op_info(std::array<atomic_t<spu_atomic_op_info_for_group>, 8>& spu_info, u8 index, u32 raddr, u32 getllar_pc)
4782+
{
4783+
auto info = spu_info[index].load();
4784+
4785+
if (info.addr / 128 == raddr / 128 && info.getllar == getllar_pc)
4786+
{
4787+
info.addr |= 127;
4788+
spu_info[index].release(info);
4789+
return;
4790+
}
4791+
4792+
info = {};
4793+
spu_info[index].release(info);
4794+
}
4795+
4796+
// To be used by PUTLLC initiates
// Returns non-zero if the caller needs to wait (1: claim bit taken from a
// temporarily-locked entry, 2: another SPU holds a full lock on the line)
int init_atomic_op_info(std::array<atomic_t<spu_atomic_op_info_for_group>, 8>& spu_info, u8 index, u32 raddr, u32 getllar_pc)
{
	// Initially store locked entry with temporary lock (low 7 bits set)
	spu_atomic_op_info_for_group info{};
	info.addr = raddr | 127;
	info.getllar = getllar_pc;

	spu_info[index].release(info);

	// Scan the other slots for a competing atomic op on the same line/site
	for (usz i = 0; i < spu_info.size(); i++)
	{
		if (i == index)
		{
			continue;
		}

		info = spu_info[i].load();

		if (info.addr / 128 == raddr / 128 && info.getllar == getllar_pc)
		{
			int wait = 0;

			// Re-check under the atomic RMW: the snapshot above may be stale
			spu_info[i].fetch_op([&](spu_atomic_op_info_for_group& value)
			{
				wait = 0;

				if (value.addr / 128 == raddr / 128 && value.getllar == getllar_pc)
				{
					if (value.addr % 128 == 0)
					{
						// Full lock held by another SPU: caller should yield
						wait = 2;
						return false;
					}

					// NOTE(review): index can reach 7, so (1u << index) may test
					// bit 7 of addr, which lies outside the 127 temporary-lock
					// mask — confirm this is intended for 8-thread groups
					if (value.addr & (1u << index))
					{
						value.addr &= ~(1u << index);
						wait = 1;
						return true;
					}
				}

				return false;
			});

			// NOTE(review): returns here even when wait == 0 (entry changed under
			// the RMW), leaving our own slot temporarily locked rather than
			// upgraded below — confirm this is the intended conservative path
			return wait;
		}
	}

	// If exclusive, upgrade to full lock (clear low bits)
	info.addr = raddr;
	info.getllar = getllar_pc;
	spu_info[index].store(info);

	return 0;
}
4854+
47024855
bool spu_thread::process_mfc_cmd()
47034856
{
47044857
// Stall infinitely if MFC queue is full
@@ -5015,11 +5168,50 @@ bool spu_thread::process_mfc_cmd()
50155168
last_getllar = pc;
50165169
last_gtsc = perf0.get();
50175170
}
5171+
else
5172+
{
5173+
last_getllar = pc;
5174+
}
50185175

50195176
last_getllar_addr = addr;
50205177
getllar_spin_count = 0;
50215178
getllar_busy_waiting_switch = umax;
50225179

5180+
if (ch_mfc_cmd.eal == spurs_addr)
5181+
{
5182+
u64 timeout = 0;
5183+
5184+
while (true)
5185+
{
5186+
const int wait = test_and_update_atomic_op_info(get_spu_atomic_op_info(this), spu_to_index(this), ch_mfc_cmd.eal, last_getllar);
5187+
5188+
if (!wait)
5189+
{
5190+
break;
5191+
}
5192+
5193+
const u64 current = get_system_time();
5194+
5195+
if (!timeout)
5196+
{
5197+
timeout = current + g_cfg.core.spu_delay_penalty * 1000;
5198+
}
5199+
else if (current >= timeout)
5200+
{
5201+
break;
5202+
}
5203+
5204+
if (wait == 2)
5205+
{
5206+
std::this_thread::yield();
5207+
}
5208+
else
5209+
{
5210+
busy_wait(50000);
5211+
}
5212+
}
5213+
}
5214+
50235215
u64 ntime = 0;
50245216
rsx::reservation_lock rsx_lock(addr, 128);
50255217

@@ -5232,6 +5424,41 @@ bool spu_thread::process_mfc_cmd()
52325424
}
52335425
}
52345426

5427+
if (ch_mfc_cmd.eal == spurs_addr)
5428+
{
5429+
u64 timeout = 0;
5430+
5431+
while (true)
5432+
{
5433+
const int wait = init_atomic_op_info(get_spu_atomic_op_info(this), spu_to_index(this), raddr, last_getllar);
5434+
5435+
if (!wait)
5436+
{
5437+
break;
5438+
}
5439+
5440+
const u64 current = get_system_time();
5441+
5442+
if (!timeout)
5443+
{
5444+
timeout = current + g_cfg.core.spu_delay_penalty * 1000;
5445+
}
5446+
else if (current >= timeout)
5447+
{
5448+
break;
5449+
}
5450+
5451+
if (wait == 2)
5452+
{
5453+
std::this_thread::yield();
5454+
}
5455+
else
5456+
{
5457+
busy_wait(50000);
5458+
}
5459+
}
5460+
}
5461+
52355462
if (do_putllc(ch_mfc_cmd))
52365463
{
52375464
ch_atomic_stat.set_value(MFC_PUTLLC_SUCCESS);
@@ -5299,6 +5526,7 @@ bool spu_thread::process_mfc_cmd()
52995526
std::memcpy(dump.data, _ptr<u8>(ch_mfc_cmd.lsa & 0x3ff80), 128);
53005527
}
53015528

5529+
downgrade_to_temporary_atomic_op_info(get_spu_atomic_op_info(this), spu_to_index(this), raddr, last_getllar);
53025530
static_cast<void>(test_stopped());
53035531
return true;
53045532
}
@@ -6180,7 +6408,12 @@ s64 spu_thread::get_ch_value(u32 ch)
61806408

61816409
eventstat_busy_waiting_switch = value ? 1 : 0;
61826410
}
6183-
6411+
6412+
if (raddr == spurs_addr)
6413+
{
6414+
downgrade_to_temporary_atomic_op_info(get_spu_atomic_op_info(this), spu_to_index(this), raddr, last_getllar);
6415+
}
6416+
61846417
for (bool is_first = true; !events.count; events = get_events(mask1 & ~SPU_EVENT_LR, true, true), is_first = false)
61856418
{
61866419
const auto old = +state;

rpcs3/Emu/Cell/SPUThread.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -497,6 +497,12 @@ struct spu_imm_table_t
497497

498498
extern const spu_imm_table_t g_spu_imm;
499499

500+
// Per-SPU record of an in-flight reservation-based atomic op (GETLLAR/PUTLLC),
// used to serialize SPURS-style contention between SPUs in a group.
struct spu_atomic_op_info_for_group
{
	// Reservation address; upper bits select the 128-byte line, while the low
	// 7 bits appear to serve as a lock state/claim mask (0 = full lock,
	// non-zero = temporary lock) — see init/test helpers in SPUThread.cpp
	u32 addr;
	// SPU PC of the GETLLAR that opened the reservation (identifies the loop site)
	u32 getllar;
};
505+
500506
enum FPSCR_EX
501507
{
502508
//Single-precision exceptions

rpcs3/Emu/Cell/lv2/sys_spu.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -297,9 +297,10 @@ struct lv2_spu_group
297297
bool set_terminate = false;
298298

299299
std::array<shared_ptr<named_thread<spu_thread>>, 8> threads; // SPU Threads
300-
std::array<s8, 256> threads_map; // SPU Threads map based number
301-
std::array<std::pair<u32, std::vector<sys_spu_segment>>, 8> imgs; // Entry points, SPU image segments
302-
std::array<std::array<u64, 4>, 8> args; // SPU Thread Arguments
300+
std::array<s8, 256> threads_map{}; // SPU Threads map based number
301+
std::array<std::pair<u32, std::vector<sys_spu_segment>>, 8> imgs{}; // Entry points, SPU image segments
302+
std::array<std::array<u64, 4>, 8> args{}; // SPU Thread Arguments
303+
std::array<atomic_t<spu_atomic_op_info_for_group>, 8> atomic_ops{};
303304

304305
shared_ptr<lv2_event_queue> ep_run; // port for SYS_SPU_THREAD_GROUP_EVENT_RUN events
305306
shared_ptr<lv2_event_queue> ep_exception; // TODO: SYS_SPU_THREAD_GROUP_EVENT_EXCEPTION

0 commit comments

Comments
 (0)