From 63a60d2f35d9453c7f4c7855a2871ccbb0f82228 Mon Sep 17 00:00:00 2001 From: Ben Nguyen Date: Fri, 12 Sep 2025 02:21:08 -0700 Subject: [PATCH 1/2] AWS peer discovery: ensure consistent hostname path ordering AWS EC2 API returns networkInterfaceSet and privateIpAddressesSet in arbitrary order, causing non-deterministic hostname resolution during peer discovery. This leads to inconsistent cluster formation. Changes: - Sort network interfaces by deviceIndex (0 first for primary ENI) - Sort private IP addresses by primary flag (primary=true first) - Add debug logging to show hostname path selection and sorting results - Add comprehensive unit tests for sorting behavior The sorting ensures deviceIndex=0 and primary=true IPs are consistently selected first, making peer discovery deterministic across deployments. --- .../src/rabbit_peer_discovery_aws.erl | 75 +++++++++- .../test/unit_SUITE.erl | 128 ++++++++++++++++-- 2 files changed, 190 insertions(+), 13 deletions(-) diff --git a/deps/rabbitmq_peer_discovery_aws/src/rabbit_peer_discovery_aws.erl b/deps/rabbitmq_peer_discovery_aws/src/rabbit_peer_discovery_aws.erl index ad1617aef9a7..a3ab65577999 100644 --- a/deps/rabbitmq_peer_discovery_aws/src/rabbit_peer_discovery_aws.erl +++ b/deps/rabbitmq_peer_discovery_aws/src/rabbit_peer_discovery_aws.erl @@ -351,10 +351,12 @@ get_hostname_by_tags(Tags) -> get_hostname_path() -> UsePrivateIP = get_config_key(aws_use_private_ip, ?CONFIG_MODULE:config_map(?BACKEND_CONFIG_KEY)), HostnamePath = get_config_key(aws_hostname_path, ?CONFIG_MODULE:config_map(?BACKEND_CONFIG_KEY)), - case HostnamePath of + FinalPath = case HostnamePath of ["privateDnsName"] when UsePrivateIP -> ["privateIpAddress"]; P -> P - end. + end, + ?LOG_DEBUG("AWS peer discovery using hostname path: ~tp", [FinalPath]), + FinalPath. -spec get_hostname(path(), props()) -> string(). get_hostname(Path, Props) -> @@ -370,9 +372,78 @@ get_value(_, []) -> get_value(Key, Props) when is_integer(Key) -> {"item", Props2} = lists:nth(Key, Props), Props2; +get_value("networkInterfaceSet", Props) -> + NetworkInterfaces = proplists:get_value("networkInterfaceSet", Props), + sort_network_interfaces_by_device_index(NetworkInterfaces); +get_value("privateIpAddressesSet", Props) -> + PrivateIpAddresses = proplists:get_value("privateIpAddressesSet", Props), + sort_private_ip_addresses_by_primary(PrivateIpAddresses); get_value(Key, Props) -> proplists:get_value(Key, Props). +%% Sort network interfaces by deviceIndex to ensure consistent ENI ordering +-spec sort_network_interfaces_by_device_index(list()) -> list(). +sort_network_interfaces_by_device_index(NetworkInterfaces) when is_list(NetworkInterfaces) -> + BeforeInfo = [format_network_interface_info(Props) || {"item", Props} <- NetworkInterfaces], + Sorted = lists:sort(fun({"item", A}, {"item", B}) -> + device_index(A) =< device_index(B) + end, NetworkInterfaces), + AfterInfo = [format_network_interface_info(Props) || {"item", Props} <- Sorted], + ?LOG_DEBUG("AWS peer discovery sorted network interfaces from ~tp to ~tp", [BeforeInfo, AfterInfo]), + Sorted; +sort_network_interfaces_by_device_index(Other) -> + Other. + +%% Sort private IP addresses by primary flag to ensure primary=true comes first +-spec sort_private_ip_addresses_by_primary(list()) -> list(). +sort_private_ip_addresses_by_primary(PrivateIpAddresses) when is_list(PrivateIpAddresses) -> + BeforeInfo = [format_private_ip_info(Props) || {"item", Props} <- PrivateIpAddresses], + Sorted = lists:sort(fun({"item", A}, {"item", B}) -> + is_primary(A) >= is_primary(B) + end, PrivateIpAddresses), + AfterInfo = [format_private_ip_info(Props) || {"item", Props} <- Sorted], + ?LOG_DEBUG("AWS peer discovery sorted private IPs from ~tp to ~tp", [BeforeInfo, AfterInfo]), + Sorted; +sort_private_ip_addresses_by_primary(Other) -> + Other. + +%% Extract deviceIndex from network interface attachment +-spec device_index(props()) -> integer(). +device_index(Interface) -> + Attachment = proplists:get_value("attachment", Interface), + case proplists:get_value("deviceIndex", Attachment) of + DeviceIndex when is_list(DeviceIndex) -> + {Int, []} = string:to_integer(DeviceIndex), + Int; + DeviceIndex when is_integer(DeviceIndex) -> + DeviceIndex + end. + +%% Extract primary flag from private IP address +-spec is_primary(props()) -> boolean(). +is_primary(IpAddress) -> + case proplists:get_value("primary", IpAddress) of + "true" -> true; + _ -> false + end. + +%% Format network interface info for logging +-spec format_network_interface_info(props()) -> string(). +format_network_interface_info(Interface) -> + ENI = proplists:get_value("networkInterfaceId", Interface, "unknown"), + DeviceIndex = device_index(Interface), + lists:flatten(io_lib:format("~s:~w", [ENI, DeviceIndex])). + +%% Format private IP info for logging +-spec format_private_ip_info(props()) -> string(). +format_private_ip_info(IpAddress) -> + IP = proplists:get_value("privateIpAddress", IpAddress, "unknown"), + Primary = case is_primary(IpAddress) of + true -> "primary"; + false -> "secondary" + end, + lists:flatten(io_lib:format("~s:~s", [IP, Primary])). + -spec get_tags() -> tags(). get_tags() -> Tags = get_config_key(aws_ec2_tags, ?CONFIG_MODULE:config_map(?BACKEND_CONFIG_KEY)), diff --git a/deps/rabbitmq_peer_discovery_aws/test/unit_SUITE.erl b/deps/rabbitmq_peer_discovery_aws/test/unit_SUITE.erl index a2c244610d81..4a99e58609a6 100644 --- a/deps/rabbitmq_peer_discovery_aws/test/unit_SUITE.erl +++ b/deps/rabbitmq_peer_discovery_aws/test/unit_SUITE.erl @@ -23,7 +23,9 @@ groups() -> {unit, [], [ maybe_add_tag_filters, get_hostname_name_from_reservation_set, - registration_support + registration_support, + network_interface_sorting, + private_ip_address_sorting ]}, {lock, [], [ lock_single_node, @@ -75,12 +77,93 @@ get_hostname_name_from_reservation_set(_Config) -> ?assertEqual(Expectation, rabbit_peer_discovery_aws:get_hostname_name_from_reservation_set( reservation_set(), [])) + end}, + {"from private IP DNS in network interface", + fun() -> + os:putenv("AWS_HOSTNAME_PATH", "networkInterfaceSet,2,privateIpAddressesSet,1,privateDnsName"), + Expectation = ["ip-10-0-15-100.eu-west-1.compute.internal", + "ip-10-0-16-31.eu-west-1.compute.internal"], + ?assertEqual(Expectation, + rabbit_peer_discovery_aws:get_hostname_name_from_reservation_set( + reservation_set(), [])) end}] }). registration_support(_Config) -> ?assertEqual(false, rabbit_peer_discovery_aws:supports_registration()). +network_interface_sorting(_Config) -> + %% Test ENI sorting by deviceIndex (DescribeInstances only returns attached ENIs) + NetworkInterfaces = [ + {"item", [ + {"networkInterfaceId", "eni-secondary"}, + {"attachment", [{"deviceIndex", "1"}]} + ]}, + {"item", [ + {"networkInterfaceId", "eni-primary"}, + {"attachment", [{"deviceIndex", "0"}]} + ]}, + {"item", [ + {"networkInterfaceId", "eni-tertiary"}, + {"attachment", [{"deviceIndex", "2"}]} + ]} + ], + + %% Should sort ENIs by deviceIndex + Sorted = rabbit_peer_discovery_aws:sort_network_interfaces_by_device_index(NetworkInterfaces), + + %% Should have all 3 ENIs + ?assertEqual(3, length(Sorted)), + + %% Primary ENI (deviceIndex=0) should be first + {"item", FirstENI} = lists:nth(1, Sorted), + ?assertEqual("eni-primary", proplists:get_value("networkInterfaceId", FirstENI)), + + %% Secondary ENI (deviceIndex=1) should be second + {"item", SecondENI} = lists:nth(2, Sorted), + ?assertEqual("eni-secondary", proplists:get_value("networkInterfaceId", SecondENI)), + + %% Tertiary ENI (deviceIndex=2) should be third + {"item", ThirdENI} = lists:nth(3, Sorted), + ?assertEqual("eni-tertiary", proplists:get_value("networkInterfaceId", ThirdENI)). + +private_ip_address_sorting(_Config) -> + %% Test private IP address sorting by primary flag + PrivateIpAddresses = [ + {"item", [ + {"privateIpAddress", "10.0.14.176"}, + {"privateDnsName", "ip-10-0-14-176.us-west-2.compute.internal"}, + {"primary", "false"} + ]}, + {"item", [ + {"privateIpAddress", "10.0.12.112"}, + {"privateDnsName", "ip-10-0-12-112.us-west-2.compute.internal"}, + {"primary", "true"} + ]}, + {"item", [ + {"privateIpAddress", "10.0.15.200"}, + {"privateDnsName", "ip-10-0-15-200.us-west-2.compute.internal"}, + {"primary", "false"} + ]} + ], + + Sorted = rabbit_peer_discovery_aws:sort_private_ip_addresses_by_primary(PrivateIpAddresses), + ?assertEqual(3, length(Sorted)), + + %% Primary IP (primary=true) should be first + {"item", FirstIP} = lists:nth(1, Sorted), + ?assertEqual("10.0.12.112", proplists:get_value("privateIpAddress", FirstIP)), + ?assertEqual("true", proplists:get_value("primary", FirstIP)), + + %% Non-primary IPs should maintain relative order + {"item", SecondIP} = lists:nth(2, Sorted), + ?assertEqual("10.0.14.176", proplists:get_value("privateIpAddress", SecondIP)), + ?assertEqual("false", proplists:get_value("primary", SecondIP)), + + {"item", ThirdIP} = lists:nth(3, Sorted), + ?assertEqual("10.0.15.200", proplists:get_value("privateIpAddress", ThirdIP)), + ?assertEqual("false", proplists:get_value("primary", ThirdIP)). + lock_single_node(_Config) -> LocalNode = node(), Nodes = [LocalNode], @@ -141,16 +224,30 @@ reservation_set() -> {"vpcId","vpc-4fe1562b"}, {"networkInterfaceSet", [ {"item", - [{"association", - [{"publicIp","203.0.113.11"}, - {"publicDnsName", - "ec2-203-0-113-11.eu-west-1.compute.amazonaws.com"}, - {"ipOwnerId","amazon"}]}]}, - {"item", - [{"association", + [{"attachment", [{"deviceIndex", "1"}]}, + {"association", [{"publicIp","203.0.113.12"}, {"publicDnsName", "ec2-203-0-113-12.eu-west-1.compute.amazonaws.com"}, + {"ipOwnerId","amazon"}]}, + {"privateIpAddressesSet", [ + {"item", [ + {"privateIpAddress", "10.0.15.101"}, + {"privateDnsName", "ip-10-0-15-101.eu-west-1.compute.internal"}, + {"primary", "false"} + ]}, + {"item", [ + {"privateIpAddress", "10.0.15.100"}, + {"privateDnsName", "ip-10-0-15-100.eu-west-1.compute.internal"}, + {"primary", "true"} + ]} + ]}]}, + {"item", + [{"attachment", [{"deviceIndex", "0"}]}, + {"association", + [{"publicIp","203.0.113.11"}, + {"publicDnsName", + "ec2-203-0-113-11.eu-west-1.compute.amazonaws.com"}, {"ipOwnerId","amazon"}]}]}]}, {"privateIpAddress","10.0.16.29"}]}]}]}, {"item", [{"reservationId","r-006cfdbf8d04c5f01"}, @@ -171,15 +268,24 @@ reservation_set() -> {"vpcId","vpc-4fe1562b"}, {"networkInterfaceSet", [ {"item", - [{"association", + [{"attachment", [{"deviceIndex", "0"}]}, + {"association", [{"publicIp","203.0.113.21"}, {"publicDnsName", "ec2-203-0-113-21.eu-west-1.compute.amazonaws.com"}, {"ipOwnerId","amazon"}]}]}, {"item", - [{"association", + [{"attachment", [{"deviceIndex", "1"}]}, + {"association", [{"publicIp","203.0.113.22"}, {"publicDnsName", "ec2-203-0-113-22.eu-west-1.compute.amazonaws.com"}, - {"ipOwnerId","amazon"}]}]}]}, + {"ipOwnerId","amazon"}]}, + {"privateIpAddressesSet", [ + {"item", [ + {"privateIpAddress", "10.0.16.31"}, + {"privateDnsName", "ip-10-0-16-31.eu-west-1.compute.internal"}, + {"primary", "true"} + ]} + ]}]}]}, {"privateIpAddress","10.0.16.31"}]}]}]}]. From 7cebf8965fc0e60a5d6074e545727b18cd71ece0 Mon Sep 17 00:00:00 2001 From: Ben Nguyen Date: Tue, 16 Sep 2025 18:02:45 -0700 Subject: [PATCH 2/2] AWS peer discovery: ensure consistent hostname path ordering (address feedback on debug logs and sorting helper functions) --- .../src/rabbit_peer_discovery_aws.erl | 61 ++++--------------- .../test/unit_SUITE.erl | 4 +- 2 files changed, 13 insertions(+), 52 deletions(-) diff --git a/deps/rabbitmq_peer_discovery_aws/src/rabbit_peer_discovery_aws.erl b/deps/rabbitmq_peer_discovery_aws/src/rabbit_peer_discovery_aws.erl index a3ab65577999..c4c0da98a2a8 100644 --- a/deps/rabbitmq_peer_discovery_aws/src/rabbit_peer_discovery_aws.erl +++ b/deps/rabbitmq_peer_discovery_aws/src/rabbit_peer_discovery_aws.erl @@ -372,40 +372,18 @@ get_value(_, []) -> get_value(Key, Props) when is_integer(Key) -> {"item", Props2} = lists:nth(Key, Props), Props2; -get_value("networkInterfaceSet", Props) -> - NetworkInterfaces = proplists:get_value("networkInterfaceSet", Props), - sort_network_interfaces_by_device_index(NetworkInterfaces); -get_value("privateIpAddressesSet", Props) -> - PrivateIpAddresses = proplists:get_value("privateIpAddressesSet", Props), - sort_private_ip_addresses_by_primary(PrivateIpAddresses); get_value(Key, Props) -> - proplists:get_value(Key, Props). - -%% Sort network interfaces by deviceIndex to ensure consistent ENI ordering --spec sort_network_interfaces_by_device_index(list()) -> list(). -sort_network_interfaces_by_device_index(NetworkInterfaces) when is_list(NetworkInterfaces) -> - BeforeInfo = [format_network_interface_info(Props) || {"item", Props} <- NetworkInterfaces], - Sorted = lists:sort(fun({"item", A}, {"item", B}) -> - device_index(A) =< device_index(B) - end, NetworkInterfaces), - AfterInfo = [format_network_interface_info(Props) || {"item", Props} <- Sorted], - ?LOG_DEBUG("AWS peer discovery sorted network interfaces from ~tp to ~tp", [BeforeInfo, AfterInfo]), - Sorted; -sort_network_interfaces_by_device_index(Other) -> - Other. - -%% Sort private IP addresses by primary flag to ensure primary=true comes first --spec sort_private_ip_addresses_by_primary(list()) -> list(). -sort_private_ip_addresses_by_primary(PrivateIpAddresses) when is_list(PrivateIpAddresses) -> - BeforeInfo = [format_private_ip_info(Props) || {"item", Props} <- PrivateIpAddresses], - Sorted = lists:sort(fun({"item", A}, {"item", B}) -> - is_primary(A) >= is_primary(B) - end, PrivateIpAddresses), - AfterInfo = [format_private_ip_info(Props) || {"item", Props} <- Sorted], - ?LOG_DEBUG("AWS peer discovery sorted private IPs from ~tp to ~tp", [BeforeInfo, AfterInfo]), - Sorted; -sort_private_ip_addresses_by_primary(Other) -> - Other. + Value = proplists:get_value(Key, Props), + sort_ec2_hostname_path_set_members(Key, Value). + +%% Sort AWS API responses for consistent ordering +-spec sort_ec2_hostname_path_set_members(string(), any()) -> any(). +sort_ec2_hostname_path_set_members("networkInterfaceSet", NetworkInterfaces) when is_list(NetworkInterfaces) -> + lists:sort(fun({"item", A}, {"item", B}) -> device_index(A) =< device_index(B) end, NetworkInterfaces); +sort_ec2_hostname_path_set_members("privateIpAddressesSet", PrivateIpAddresses) when is_list(PrivateIpAddresses) -> + lists:sort(fun({"item", A}, {"item", B}) -> is_primary(A) >= is_primary(B) end, PrivateIpAddresses); +sort_ec2_hostname_path_set_members(_, Value) -> + Value. %% Extract deviceIndex from network interface attachment -spec device_index(props()) -> integer(). @@ -427,23 +405,6 @@ is_primary(IpAddress) -> _ -> false end. -%% Format network interface info for logging --spec format_network_interface_info(props()) -> string(). -format_network_interface_info(Interface) -> - ENI = proplists:get_value("networkInterfaceId", Interface, "unknown"), - DeviceIndex = device_index(Interface), - lists:flatten(io_lib:format("~s:~w", [ENI, DeviceIndex])). - -%% Format private IP info for logging --spec format_private_ip_info(props()) -> string(). -format_private_ip_info(IpAddress) -> - IP = proplists:get_value("privateIpAddress", IpAddress, "unknown"), - Primary = case is_primary(IpAddress) of - true -> "primary"; - false -> "secondary" - end, - lists:flatten(io_lib:format("~s:~s", [IP, Primary])). - -spec get_tags() -> tags(). get_tags() -> Tags = get_config_key(aws_ec2_tags, ?CONFIG_MODULE:config_map(?BACKEND_CONFIG_KEY)), diff --git a/deps/rabbitmq_peer_discovery_aws/test/unit_SUITE.erl b/deps/rabbitmq_peer_discovery_aws/test/unit_SUITE.erl index 4a99e58609a6..c705ef3853d1 100644 --- a/deps/rabbitmq_peer_discovery_aws/test/unit_SUITE.erl +++ b/deps/rabbitmq_peer_discovery_aws/test/unit_SUITE.erl @@ -110,7 +110,7 @@ network_interface_sorting(_Config) -> ], %% Should sort ENIs by deviceIndex - Sorted = rabbit_peer_discovery_aws:sort_network_interfaces_by_device_index(NetworkInterfaces), + Sorted = rabbit_peer_discovery_aws:sort_ec2_hostname_path_set_members("networkInterfaceSet", NetworkInterfaces), %% Should have all 3 ENIs ?assertEqual(3, length(Sorted)), @@ -147,7 +147,7 @@ private_ip_address_sorting(_Config) -> ]} ], - Sorted = rabbit_peer_discovery_aws:sort_private_ip_addresses_by_primary(PrivateIpAddresses), + Sorted = rabbit_peer_discovery_aws:sort_ec2_hostname_path_set_members("privateIpAddressesSet", PrivateIpAddresses), ?assertEqual(3, length(Sorted)), %% Primary IP (primary=true) should be first