Skip to content

Commit a8a163e

Browse files
authored
fix: change Connect and ReplicationConnection to be temporary (#1495)
We don't want these processes to be transient: when a database connection has issues, the supervisor keeps trying to reconnect, and if the database is completely inaccessible the supervisor reaches its max restarts, causing the whole partition supervisor to crash. Connect.lookup_or_start_connection ensures that the connection is brought back up when it is needed, so we don't need to worry about restarting these processes straight away.
1 parent 6a0ee73 commit a8a163e

File tree

5 files changed

+40
-3
lines changed

5 files changed

+40
-3
lines changed

lib/realtime/tenants/connect.ex

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ defmodule Realtime.Tenants.Connect do
77
* `:check_connect_region_interval` - The interval in milliseconds to check if this process is in the correct region. If the region is not correct it stops the connection.
88
* `:erpc_timeout` - The timeout in milliseconds for the `:erpc` calls to the tenant's database.
99
"""
10-
use GenServer, restart: :transient
10+
use GenServer, restart: :temporary
1111

1212
use Realtime.Logs
1313

lib/realtime/tenants/replication_connection.ex

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ defmodule Realtime.Tenants.ReplicationConnection do
9797
child_spec = %{
9898
id: __MODULE__,
9999
start: {Wrapper, :start_link, [opts, init_timeout]},
100-
restart: :transient,
100+
restart: :temporary,
101101
type: :worker
102102
}
103103

mix.exs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ defmodule Realtime.MixProject do
44
def project do
55
[
66
app: :realtime,
7-
version: "2.42.0",
7+
version: "2.42.1",
88
elixir: "~> 1.17.3",
99
elixirc_paths: elixirc_paths(Mix.env()),
1010
start_permanent: Mix.env() == :prod,

test/realtime/tenants/connect_test.exs

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,21 @@ defmodule Realtime.Tenants.ConnectTest do
3535
refute_receive {:DOWN, ^ref, :process, ^pid, _reason}, timeout
3636
end
3737

38+
describe "temporary process" do
39+
test "starts a temporary process", %{tenant: tenant} do
40+
assert {:ok, _} = Connect.lookup_or_start_connection(tenant.external_id)
41+
pid = Connect.whereis(tenant.external_id)
42+
# Brutally kill the process
43+
Process.exit(pid, :kill)
44+
assert_process_down(pid)
45+
# Wait to ensure that the process has not restarted
46+
Process.sleep(1000)
47+
48+
# Temporary process should not be registered in syn
49+
refute Connect.whereis(tenant.external_id)
50+
end
51+
end
52+
3853
describe "handle cold start" do
3954
test "multiple proccesses succeed together", %{tenant: tenant} do
4055
parent = self()

test/realtime/tenants/replication_connection_test.exs

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,23 @@ defmodule Realtime.Tenants.ReplicationConnectionTest do
2727
%{tenant: tenant}
2828
end
2929

30+
describe "temporary process" do
31+
test "starts a temporary process", %{tenant: tenant} do
32+
assert {:ok, pid} = ReplicationConnection.start(tenant, self())
33+
assert conn = ReplicationConnection.whereis(tenant.external_id)
34+
35+
# Brutally kill the process
36+
Process.exit(pid, :kill)
37+
assert_process_down(pid)
38+
assert_process_down(conn)
39+
# Wait to ensure that the process has not restarted
40+
Process.sleep(1000)
41+
42+
# Temporary process should not be registered
43+
refute ReplicationConnection.whereis(tenant.external_id)
44+
end
45+
end
46+
3047
describe "replication" do
3148
test "fails if tenant connection is invalid" do
3249
tenant =
@@ -387,4 +404,9 @@ defmodule Realtime.Tenants.ReplicationConnectionTest do
387404

388405
Endpoint.subscribe(tenant_topic, metadata: fastlane)
389406
end
407+
408+
defp assert_process_down(pid, timeout \\ 100) do
409+
ref = Process.monitor(pid)
410+
assert_receive {:DOWN, ^ref, :process, ^pid, _reason}, timeout
411+
end
390412
end

0 commit comments

Comments
 (0)