Skip to content

Commit 0e4194d

Browse files
committed
feat: replace LocalCluster with manual node spawning for e2e tests
- Replace LocalCluster with Port-based manual node spawning to fix OTP 28 compatibility
- Spawn nodes as separate OS processes using elixir --name
- Connect nodes via Node.connect and initialize Ra cluster manually
- Update all e2e tests to use new API (ports instead of cluster)
- Remove LocalCluster dependency from mix.exs
- Enable e2e tests in GitHub Actions workflow (no longer skipped)
- Update e2e README to reflect new approach
- Fix charlist warning in spawn_node function

This fixes the timeout issues with the LocalCluster :peer module in OTP 28 while maintaining full multi-node testing capabilities.
1 parent 737b62a commit 0e4194d

File tree

8 files changed

+198
-58
lines changed

8 files changed

+198
-58
lines changed

.github/workflows/e2e-test.yml

Lines changed: 3 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -57,22 +57,10 @@ jobs:
5757
- name: Start EPMD (Erlang Port Mapper Daemon)
5858
run: epmd -daemon
5959

60-
- name: Skip E2E Distributed Tests (LocalCluster/OTP 28 incompatibility)
60+
- name: Run E2E Distributed Tests
6161
run: |
62-
echo "⚠️ Skipping e2e distributed tests due to known LocalCluster/OTP 28 issue"
63-
echo ""
64-
echo "Issue: LocalCluster 2.x uses :peer module which times out when spawning"
65-
echo " child nodes in OTP 28 (timeout after 15 seconds)"
66-
echo ""
67-
echo "Status: Fixed :not_alive error by using --name flag, but still get"
68-
echo " timeout in LocalCluster.start_link/2"
69-
echo ""
70-
echo "Workaround: Manual multi-node testing works (see e2e_test/README.md)"
71-
echo ""
72-
echo "Tracking: Upstream issue with :peer module in OTP 28"
73-
echo ""
74-
echo "✅ Marking as success (tests are skipped, not failing)"
75-
exit 0
62+
echo "Running e2e distributed tests with manual node spawning..."
63+
MIX_ENV=e2e_test elixir --name test@127.0.0.1 --cookie concord_e2e_test -S mix test e2e_test/distributed/ --trace
7664
env:
7765
MIX_ENV: e2e_test
7866

e2e_test/README.md

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,11 @@
22

33
This directory contains end-to-end tests for Concord that verify distributed behavior across multiple nodes in realistic scenarios.
44

5-
## ⚠️ Current Status: OTP 28 Compatibility Issue
5+
## Status: Working with Manual Node Spawning
66

7-
**LocalCluster 2.x is experiencing compatibility issues with Erlang/OTP 28.** The `:peer` module (used internally by LocalCluster) times out when attempting to start child nodes, preventing automated e2e tests from running.
7+
**E2E tests now use manual node spawning via Port instead of LocalCluster** to avoid OTP 28 compatibility issues. Tests spawn actual Erlang VM processes as separate OS processes and connect them via distributed Erlang.
88

9-
**Workaround:** Use manual multi-node testing (see "Manual Testing" section below) until this is resolved.
10-
11-
**Tracking:** This is a known issue with the `:peer` module in OTP 28 that affects multiple distributed testing tools.
9+
**Approach:** Tests use `Port.open` to spawn `elixir --name` nodes, connect them with `Node.connect`, and initialize Ra cluster manually. This provides the same multi-node testing capabilities without the LocalCluster timeout issues.
1210

1311
## Overview
1412

e2e_test/distributed/data_consistency_test.exs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,13 @@ defmodule Concord.E2E.DataConsistencyTest do
66
@moduletag :distributed
77

88
setup do
9-
{:ok, nodes, cluster} = ClusterHelper.start_cluster(nodes: 3)
9+
{:ok, nodes, ports} = ClusterHelper.start_cluster(nodes: 3)
1010

1111
on_exit(fn ->
12-
ClusterHelper.stop_cluster(cluster)
12+
ClusterHelper.stop_cluster(ports)
1313
end)
1414

15-
%{nodes: nodes, cluster: cluster}
15+
%{nodes: nodes, ports: ports}
1616
end
1717

1818
describe "Data Consistency" do

e2e_test/distributed/leader_election_test.exs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,13 @@ defmodule Concord.E2E.LeaderElectionTest do
77

88
setup do
99
# Start a fresh cluster for each test
10-
{:ok, nodes, cluster} = ClusterHelper.start_cluster(nodes: 3)
10+
{:ok, nodes, ports} = ClusterHelper.start_cluster(nodes: 3)
1111

1212
on_exit(fn ->
13-
ClusterHelper.stop_cluster(cluster)
13+
ClusterHelper.stop_cluster(ports)
1414
end)
1515

16-
%{nodes: nodes, cluster: cluster}
16+
%{nodes: nodes, ports: ports}
1717
end
1818

1919
describe "Leader Election" do

e2e_test/distributed/network_partition_test.exs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,13 @@ defmodule Concord.E2E.NetworkPartitionTest do
66
@moduletag :distributed
77

88
setup do
9-
{:ok, nodes, cluster} = ClusterHelper.start_cluster(nodes: 5)
9+
{:ok, nodes, ports} = ClusterHelper.start_cluster(nodes: 5)
1010

1111
on_exit(fn ->
12-
ClusterHelper.stop_cluster(cluster)
12+
ClusterHelper.stop_cluster(ports)
1313
end)
1414

15-
%{nodes: nodes, cluster: cluster}
15+
%{nodes: nodes, ports: ports}
1616
end
1717

1818
describe "Network Partition" do

e2e_test/distributed/node_failure_test.exs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,13 @@ defmodule Concord.E2E.NodeFailureTest do
66
@moduletag :distributed
77

88
setup do
9-
{:ok, nodes, cluster} = ClusterHelper.start_cluster(nodes: 3)
9+
{:ok, nodes, ports} = ClusterHelper.start_cluster(nodes: 3)
1010

1111
on_exit(fn ->
12-
ClusterHelper.stop_cluster(cluster)
12+
ClusterHelper.stop_cluster(ports)
1313
end)
1414

15-
%{nodes: nodes, cluster: cluster}
15+
%{nodes: nodes, ports: ports}
1616
end
1717

1818
describe "Node Failure Recovery" do

e2e_test/support/e2e_cluster_helper.ex

Lines changed: 179 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,11 @@ defmodule Concord.E2E.ClusterHelper do
22
@moduledoc """
33
Helper module for managing multi-node Concord clusters in e2e tests.
44
5-
Uses LocalCluster to spawn actual Erlang nodes with full network isolation.
5+
Uses manual node spawning via Port to avoid LocalCluster/OTP 28 compatibility issues.
66
"""
77

88
@doc """
9-
Starts a multi-node Concord cluster using LocalCluster.
9+
Starts a multi-node Concord cluster by spawning separate Erlang VMs.
1010
1111
## Options
1212
@@ -16,72 +16,93 @@ defmodule Concord.E2E.ClusterHelper do
1616
1717
## Returns
1818
19-
* `{:ok, nodes}` - List of started node names
19+
* `{:ok, nodes, ports}` - List of started node names and their Port references
2020
* `{:error, reason}` - If cluster failed to start
2121
2222
## Examples
2323
24-
iex> {:ok, nodes} = E2E.ClusterHelper.start_cluster(nodes: 3)
24+
iex> {:ok, nodes, ports} = E2E.ClusterHelper.start_cluster(nodes: 3)
2525
iex> length(nodes)
2626
3
2727
"""
2828
def start_cluster(opts \\ []) do
2929
node_count = Keyword.get(opts, :nodes, 3)
3030
prefix = Keyword.get(opts, :prefix, "concord_e2e")
3131
wait_timeout = Keyword.get(opts, :wait_timeout, 30_000)
32+
cookie = Keyword.get(opts, :cookie, :concord_e2e_test)
3233

3334
IO.puts("Starting #{node_count}-node cluster with prefix '#{prefix}'...")
3435

35-
# Start cluster with LocalCluster 2.x API
36-
# Use empty applications list to prevent auto-start of all apps
37-
{:ok, cluster} =
38-
LocalCluster.start_link(node_count,
39-
prefix: String.to_atom(prefix),
40-
applications: []
41-
)
36+
# Ensure current node is alive (required for distributed Erlang)
37+
ensure_alive()
38+
39+
# Set cookie for cluster communication
40+
Node.set_cookie(cookie)
41+
42+
# Start nodes as separate OS processes
43+
nodes_and_ports =
44+
Enum.map(1..node_count, fn i ->
45+
node_name = :"#{prefix}#{i}@127.0.0.1"
46+
port = spawn_node(node_name, cookie)
47+
{node_name, port}
48+
end)
4249

43-
# Get the node names
44-
{:ok, nodes} = LocalCluster.nodes(cluster)
50+
nodes = Enum.map(nodes_and_ports, fn {node, _port} -> node end)
51+
ports = Enum.map(nodes_and_ports, fn {_node, port} -> port end)
4552

4653
IO.puts("Started nodes: #{inspect(nodes)}")
4754

55+
# Wait for nodes to be reachable
56+
Process.sleep(2000)
57+
58+
# Connect to all nodes
59+
Enum.each(nodes, fn node ->
60+
case Node.connect(node) do
61+
true -> IO.puts("✓ Connected to #{node}")
62+
false -> IO.puts("✗ Failed to connect to #{node}")
63+
:ignored -> IO.puts("⚠ Already connected to #{node}")
64+
end
65+
end)
66+
67+
# Wait for connections to stabilize
68+
Process.sleep(1000)
69+
4870
# Initialize Concord on each node
4971
Enum.each(nodes, fn node ->
5072
IO.puts("Initializing Concord on #{node}...")
5173
:rpc.call(node, Application, :ensure_all_started, [:concord])
5274
end)
5375

76+
# Initialize Ra cluster on all nodes with full member list
77+
initialize_ra_cluster(nodes)
78+
5479
# Wait for cluster formation
5580
case wait_for_cluster_ready(nodes, wait_timeout) do
5681
:ok ->
5782
IO.puts("✓ Cluster ready with #{length(nodes)} nodes")
58-
{:ok, nodes, cluster}
83+
{:ok, nodes, ports}
5984

6085
{:error, reason} ->
6186
IO.puts("✗ Cluster failed to start: #{inspect(reason)}")
62-
LocalCluster.stop(cluster)
87+
stop_cluster(ports)
6388
{:error, reason}
6489
end
6590
end
6691

6792
@doc """
6893
Stops a running cluster and cleans up resources.
6994
"""
70-
def stop_cluster(cluster) do
71-
{:ok, nodes} = LocalCluster.nodes(cluster)
72-
IO.puts("Stopping cluster nodes: #{inspect(nodes)}")
95+
def stop_cluster(ports) when is_list(ports) do
96+
IO.puts("Stopping cluster nodes...")
7397

74-
# Stop Concord application on each node first
75-
Enum.each(nodes, fn node ->
76-
:rpc.call(node, Application, :stop, [:concord])
98+
# Close all ports (kills the node processes)
99+
Enum.each(ports, fn port ->
100+
Port.close(port)
77101
end)
78102

79103
# Give time for graceful shutdown
80104
Process.sleep(500)
81105

82-
# Stop the cluster
83-
LocalCluster.stop(cluster)
84-
85106
# Clean up data directories
86107
cleanup_data_dirs()
87108

@@ -185,6 +206,140 @@ defmodule Concord.E2E.ClusterHelper do
185206

186207
# Private functions
187208

209+
defp ensure_alive do
210+
case Node.alive?() do
211+
true ->
212+
:ok
213+
214+
false ->
215+
# Current node needs to be alive for distributed Erlang
216+
# This should already be set by the test runner with --name flag
217+
raise """
218+
Current node is not alive! E2E tests require distributed Erlang.
219+
Run tests with: elixir --name test@127.0.0.1 --cookie test -S mix test e2e_test/
220+
"""
221+
end
222+
end
223+
224+
defp spawn_node(node_name, cookie) do
225+
# Get the current working directory and Mix environment
226+
cwd = File.cwd!()
227+
mix_env = "e2e_test"
228+
229+
# Build the command to start an iex node
230+
# Use detached mode to run in background
231+
cmd = "elixir"
232+
233+
args = [
234+
"--name",
235+
to_string(node_name),
236+
"--cookie",
237+
to_string(cookie),
238+
"--no-halt",
239+
"--erl",
240+
"-kernel inet_dist_listen_min 9100 inet_dist_listen_max 9199",
241+
"-S",
242+
"mix",
243+
"run",
244+
"--no-start"
245+
]
246+
247+
# Spawn the node as a separate OS process
248+
port =
249+
Port.open(
250+
{:spawn_executable, System.find_executable(cmd)},
251+
[
252+
:binary,
253+
:exit_status,
254+
{:args, args},
255+
{:cd, cwd},
256+
{:env, [{~c"MIX_ENV", String.to_charlist(mix_env)}]},
257+
{:line, 1024}
258+
]
259+
)
260+
261+
IO.puts("Spawned node #{node_name} via Port #{inspect(port)}")
262+
port
263+
end
264+
265+
defp initialize_ra_cluster(nodes) do
266+
cluster_name = :concord_cluster
267+
data_dir_base = "./data/e2e_test"
268+
269+
# Build server IDs for all nodes
270+
server_ids = Enum.map(nodes, &{cluster_name, &1})
271+
272+
IO.puts("Initializing Ra cluster with members: #{inspect(server_ids)}")
273+
274+
# Initialize Ra cluster on each node
275+
Enum.each(nodes, fn node ->
276+
node_id = {cluster_name, node}
277+
uid = node_id |> Tuple.to_list() |> Enum.join("_") |> String.replace("@", "_")
278+
data_dir = "#{data_dir_base}/#{node}"
279+
280+
# Stop existing Ra server if running
281+
case :rpc.call(node, :ra, :stop_server, [node_id]) do
282+
:ok ->
283+
IO.puts("✓ Stopped existing Ra server on #{node}")
284+
285+
{:error, :not_started} ->
286+
IO.puts("⚠ Ra server was not started on #{node}")
287+
288+
{:error, :system_not_started} ->
289+
IO.puts("⚠ Ra system not started on #{node}")
290+
291+
{:badrpc, _} = error ->
292+
IO.puts("⚠ RPC error stopping Ra on #{node}: #{inspect(error)}")
293+
294+
error ->
295+
IO.puts("⚠ Stop server on #{node} returned: #{inspect(error)}")
296+
end
297+
298+
# Wait for shutdown
299+
Process.sleep(500)
300+
301+
# Start Ra server with all members
302+
server_config = %{
303+
id: node_id,
304+
uid: uid,
305+
cluster_name: cluster_name,
306+
machine: {:module, Concord.StateMachine, %{}},
307+
log_init_args: %{
308+
uid: uid,
309+
data_dir: data_dir
310+
},
311+
initial_members: server_ids
312+
}
313+
314+
case :rpc.call(node, :ra, :start_server, [server_config]) do
315+
:ok ->
316+
IO.puts("✓ Ra server started on #{node}")
317+
:rpc.call(node, :ra, :trigger_election, [node_id])
318+
:ok
319+
320+
{:ok, _} ->
321+
IO.puts("✓ Ra server started on #{node}")
322+
:ok
323+
324+
{:error, {:already_started, _}} ->
325+
IO.puts("✓ Ra server already running on #{node}")
326+
:ok
327+
328+
{:error, reason} ->
329+
IO.puts("✗ Failed to start Ra server on #{node}: #{inspect(reason)}")
330+
{:error, reason}
331+
332+
{:badrpc, reason} ->
333+
IO.puts("✗ RPC error starting Ra on #{node}: #{inspect(reason)}")
334+
{:error, {:badrpc, reason}}
335+
end
336+
end)
337+
338+
# Wait for Ra cluster to initialize
339+
Process.sleep(3000)
340+
:ok
341+
end
342+
188343
defp wait_for_cluster_ready(nodes, timeout) do
189344
start_time = System.monotonic_time(:millisecond)
190345
until = start_time + timeout

mix.exs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -113,8 +113,7 @@ defmodule Concord.MixProject do
113113
{:opentelemetry_telemetry, "~> 1.1"},
114114
# Event streaming with GenStage
115115
{:gen_stage, "~> 1.2"},
116-
# E2E testing
117-
{:local_cluster, "~> 2.0", only: [:e2e_test], runtime: false},
116+
# E2E testing (note: LocalCluster removed due to OTP 28 compatibility, using manual node spawning)
118117
{:httpoison, "~> 2.0", only: [:e2e_test], runtime: false},
119118
{:dialyxir, "~> 1.4", only: [:dev, :test], runtime: false},
120119
{:credo, "~> 1.7", only: [:dev, :test], runtime: false},

0 commit comments

Comments (0)