From 19e7f1d7fba3995256dbc4efd80378e33ad29848 Mon Sep 17 00:00:00 2001 From: "Al @h0lybyte" <5599058+h0lybyte@users.noreply.github.com> Date: Tue, 2 Sep 2025 18:15:32 -0400 Subject: [PATCH 1/8] =?UTF-8?q?=F0=9F=94=84=20Sync=20with=20upstream=20cha?= =?UTF-8?q?nges=20(#2)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * chore: fix couple of flaky tests (#1517) * fix: Improve runtime setup logic (#1511) Cleanup runtime.exs logic to be more organized and easier to mantain * fix: runtime setup error (#1520) --------- Co-authored-by: Eduardo Gurgel Co-authored-by: Filipe Cabaço --- .gitignore | 6 +- config/runtime.exs | 370 ++++++++---------- lib/realtime/application.ex | 21 +- lib/realtime/nodes.ex | 27 -- mix.exs | 2 +- run.sh | 2 +- .../monitoring/distributed_metrics_test.exs | 23 -- .../rate_counter/rate_counter_test.exs | 5 +- 8 files changed, 185 insertions(+), 271 deletions(-) diff --git a/.gitignore b/.gitignore index a96970546..fec4b85ab 100644 --- a/.gitignore +++ b/.gitignore @@ -27,13 +27,9 @@ realtime-*.tar # Ignore Dialyzer .plt /priv/plts/* - node_modules - .supabase - config/prod.secret.exs - demo/.env - .lexical +.vscode \ No newline at end of file diff --git a/config/runtime.exs b/config/runtime.exs index 99d524e54..ac0a2569b 100644 --- a/config/runtime.exs +++ b/config/runtime.exs @@ -1,9 +1,25 @@ import Config -config :logflare_logger_backend, - url: System.get_env("LOGFLARE_LOGGER_BACKEND_URL", "https://api.logflare.app") +defmodule Env do + def get_integer(env, default) do + value = System.get_env(env) + if value, do: String.to_integer(value), else: default + end + + def get_charlist(env, default) do + value = System.get_env(env) + if value, do: String.to_charlist(value), else: default + end + + def get_boolean(env, default) do + value = System.get_env(env) + if value, do: String.to_existing_atom(value), else: default + end +end app_name = System.get_env("APP_NAME", "") + +# Setup Database default_db_host = System.get_env("DB_HOST", "127.0.0.1") username = System.get_env("DB_USER", "postgres") password = System.get_env("DB_PASSWORD", "postgres") @@ -11,62 +27,51 @@ database = System.get_env("DB_NAME", "postgres") port = System.get_env("DB_PORT", "5432") db_version = System.get_env("DB_IP_VERSION") slot_name_suffix = System.get_env("SLOT_NAME_SUFFIX") +db_ssl_enabled? = Env.get_boolean("DB_SSL", false) +db_ssl_ca_cert = System.get_env("DB_SSL_CA_CERT") +queue_target = Env.get_integer("DB_QUEUE_TARGET", 5000) +queue_interval = Env.get_integer("DB_QUEUE_INTERVAL", 5000) +pool_size = Env.get_integer("DB_POOL_SIZE", 5) + +after_connect_query_args = + case System.get_env("DB_AFTER_CONNECT_QUERY") do + nil -> nil + query -> {Postgrex, :query!, [query, []]} + end ssl_opts = - if System.get_env("DB_SSL", "false") == "true" do - if cert = System.get_env("DB_SSL_CA_CERT") do - [cacertfile: cert] - else - [verify: :verify_none] - end - else - false + cond do + db_ssl_enabled? and is_binary(db_ssl_ca_cert) -> [cacertfile: db_ssl_ca_cert] + db_ssl_enabled? 
-> [verify: :verify_none] + true -> false end -tenant_cache_expiration = - System.get_env("TENANT_CACHE_EXPIRATION_IN_MS", "30000") |> String.to_integer() - -migration_partition_slots = - System.get_env("MIGRATION_PARTITION_SLOTS", "#{System.schedulers_online() * 2}") |> String.to_integer() - -connect_partition_slots = - System.get_env("CONNECT_PARTITION_SLOTS", "#{System.schedulers_online() * 2}") |> String.to_integer() - -# defaults to 30 minutes -metrics_cleaner_schedule_timer_in_ms = - System.get_env("METRICS_CLEANER_SCHEDULE_TIMER_IN_MS", "1800000") |> String.to_integer() - -metrics_rpc_timeout_in_ms = - System.get_env("METRICS_RPC_TIMEOUT_IN_MS", "15000") |> String.to_integer() - -rebalance_check_interval_in_ms = - System.get_env("REBALANCE_CHECK_INTERVAL_IN_MS", to_string(:timer.minutes(10))) |> String.to_integer() - -disconnect_socket_on_no_channels_interval_in_ms = - System.get_env("DISCONNECT_SOCKET_ON_NO_CHANNELS_INTERVAL_IN_MS", "30000") |> String.to_integer() - -tenant_max_bytes_per_second = System.get_env("TENANT_MAX_BYTES_PER_SECOND", "100000") |> String.to_integer() -tenant_max_channels_per_client = System.get_env("TENANT_MAX_CHANNELS_PER_CLIENT", "100") |> String.to_integer() -tenant_max_concurrent_users = System.get_env("TENANT_MAX_CONCURRENT_USERS", "200") |> String.to_integer() -tenant_max_events_per_second = System.get_env("TENANT_MAX_EVENTS_PER_SECOND", "100") |> String.to_integer() -tenant_max_joins_per_second = System.get_env("TENANT_MAX_JOINS_PER_SECOND", "100") |> String.to_integer() -rpc_timeout = System.get_env("RPC_TIMEOUT", "30000") |> String.to_integer() -max_gen_rpc_clients = System.get_env("MAX_GEN_RPC_CLIENTS", "5") |> String.to_integer() -run_janitor? = System.get_env("RUN_JANITOR", "false") == "true" -janitor_schedule_randomize = System.get_env("JANITOR_SCHEDULE_RANDOMIZE", "true") == "true" -janitor_max_children = System.get_env("JANITOR_MAX_CHILDREN", "5") |> String.to_integer() -janitor_chunk_size = System.get_env("JANITOR_CHUNK_SIZE", "10") |> String.to_integer() -# defaults to 10 minutes -janitor_run_after_in_ms = System.get_env("JANITOR_RUN_AFTER_IN_MS", "600000") |> String.to_integer() -# defaults to 5 seconds -janitor_children_timeout = System.get_env("JANITOR_CHILDREN_TIMEOUT", "5000") |> String.to_integer() -# 4 hours by default -janitor_schedule_timer = System.get_env("JANITOR_SCHEDULE_TIMER_IN_MS", "14400000") |> String.to_integer() -# defaults to 10 minutes +tenant_cache_expiration = Env.get_integer("TENANT_CACHE_EXPIRATION_IN_MS", :timer.seconds(30)) +migration_partition_slots = Env.get_integer("MIGRATION_PARTITION_SLOTS", System.schedulers_online() * 2) +connect_partition_slots = Env.get_integer("CONNECT_PARTITION_SLOTS", System.schedulers_online() * 2) +metrics_cleaner_schedule_timer_in_ms = Env.get_integer("METRICS_CLEANER_SCHEDULE_TIMER_IN_MS", :timer.minutes(30)) +metrics_rpc_timeout_in_ms = Env.get_integer("METRICS_RPC_TIMEOUT_IN_MS", :timer.seconds(15)) +rebalance_check_interval_in_ms = Env.get_integer("REBALANCE_CHECK_INTERVAL_IN_MS", :timer.minutes(10)) +tenant_max_bytes_per_second = Env.get_integer("TENANT_MAX_BYTES_PER_SECOND", 100_000) +tenant_max_channels_per_client = Env.get_integer("TENANT_MAX_CHANNELS_PER_CLIENT", 100) +tenant_max_concurrent_users = Env.get_integer("TENANT_MAX_CONCURRENT_USERS", 200) +tenant_max_events_per_second = Env.get_integer("TENANT_MAX_EVENTS_PER_SECOND", 100) +tenant_max_joins_per_second = Env.get_integer("TENANT_MAX_JOINS_PER_SECOND", 100) +rpc_timeout = Env.get_integer("RPC_TIMEOUT", 
:timer.seconds(30)) +max_gen_rpc_clients = Env.get_integer("MAX_GEN_RPC_CLIENTS", 5) +run_janitor? = Env.get_boolean("RUN_JANITOR", false) +janitor_schedule_randomize = Env.get_boolean("JANITOR_SCHEDULE_RANDOMIZE", true) +janitor_max_children = Env.get_integer("JANITOR_MAX_CHILDREN", 5) +janitor_chunk_size = Env.get_integer("JANITOR_CHUNK_SIZE", 10) +janitor_run_after_in_ms = Env.get_integer("JANITOR_RUN_AFTER_IN_MS", :timer.minutes(10)) +janitor_children_timeout = Env.get_integer("JANITOR_CHILDREN_TIMEOUT", :timer.seconds(5)) +janitor_schedule_timer = Env.get_integer("JANITOR_SCHEDULE_TIMER_IN_MS", :timer.hours(4)) +platform = if System.get_env("AWS_EXECUTION_ENV") == "AWS_ECS_FARGATE", do: :aws, else: :fly + no_channel_timeout_in_ms = if config_env() == :test, - do: 3000, - else: System.get_env("NO_CHANNEL_TIMEOUT_IN_MS", "600000") |> String.to_integer() + do: :timer.seconds(3), + else: Env.get_integer("NO_CHANNEL_TIMEOUT_IN_MS", :timer.minutes(10)) if !(db_version in [nil, "ipv6", "ipv4"]), do: raise("Invalid IP version, please set either ipv6 or ipv4") @@ -86,6 +91,20 @@ socket_options = end end +config :realtime, Realtime.Repo, + hostname: default_db_host, + username: username, + password: password, + database: database, + port: port, + pool_size: pool_size, + queue_target: queue_target, + queue_interval: queue_interval, + parameters: [application_name: "supabase_mt_realtime"], + after_connect: after_connect_query_args, + socket_options: socket_options, + ssl: ssl_opts + config :realtime, migration_partition_slots: migration_partition_slots, connect_partition_slots: connect_partition_slots, @@ -98,63 +117,96 @@ config :realtime, metrics_cleaner_schedule_timer_in_ms: metrics_cleaner_schedule_timer_in_ms, metrics_rpc_timeout: metrics_rpc_timeout_in_ms, tenant_cache_expiration: tenant_cache_expiration, - disconnect_socket_on_no_channels_interval_in_ms: disconnect_socket_on_no_channels_interval_in_ms, rpc_timeout: rpc_timeout, max_gen_rpc_clients: max_gen_rpc_clients, - no_channel_timeout_in_ms: no_channel_timeout_in_ms + no_channel_timeout_in_ms: no_channel_timeout_in_ms, + platform: platform -if config_env() == :test || !run_janitor? do - config :realtime, run_janitor: false -else +if config_env() != :test && run_janitor? do config :realtime, - # disabled for now by default - run_janitor: run_janitor?, + run_janitor: true, janitor_schedule_randomize: janitor_schedule_randomize, janitor_max_children: janitor_max_children, janitor_chunk_size: janitor_chunk_size, - # defaults the runner to only start after 10 minutes janitor_run_after_in_ms: janitor_run_after_in_ms, janitor_children_timeout: janitor_children_timeout, - # defaults to 4 hours janitor_schedule_timer: janitor_schedule_timer end -if config_env() == :prod do - secret_key_base = - System.get_env("SECRET_KEY_BASE") || - raise """ - environment variable SECRET_KEY_BASE is missing. 
- You can generate one by calling: mix phx.gen.secret - """ +default_cluster_strategy = + case config_env() do + :prod -> "POSTGRES" + _ -> "EPMD" + end - if app_name == "" do - raise "APP_NAME not available" +cluster_topologies = + System.get_env("CLUSTER_STRATEGIES", default_cluster_strategy) + |> String.upcase() + |> String.split(",") + |> Enum.reduce([], fn strategy, acc -> + strategy + |> String.trim() + |> then(fn + "DNS" -> + [ + dns: [ + strategy: Cluster.Strategy.DNSPoll, + config: [polling_interval: 5_000, query: System.get_env("DNS_NODES"), node_basename: app_name] + ] + ] ++ acc + + "POSTGRES" -> + [ + postgres: [ + strategy: LibclusterPostgres.Strategy, + config: [ + hostname: default_db_host, + username: username, + password: password, + database: database, + port: port, + parameters: [application_name: "cluster_node_#{node()}"], + socket_options: socket_options, + ssl: ssl_opts, + heartbeat_interval: 5_000 + ] + ] + ] ++ acc + + "EPMD" -> + [ + dev: [ + strategy: Cluster.Strategy.Epmd, + config: [hosts: [:"orange@127.0.0.1", :"pink@127.0.0.1"]], + connect: {:net_kernel, :connect_node, []}, + disconnect: {:net_kernel, :disconnect_node, []} + ] + ] ++ acc + + _ -> + acc + end) + end) + +# Setup Logging + +if System.get_env("LOGS_ENGINE") == "logflare" do + config :logflare_logger_backend, url: System.get_env("LOGFLARE_LOGGER_BACKEND_URL", "https://api.logflare.app") + + if !System.get_env("LOGFLARE_API_KEY") or !System.get_env("LOGFLARE_SOURCE_ID") do + raise """ + Environment variable LOGFLARE_API_KEY or LOGFLARE_SOURCE_ID is missing. + Check those variables or choose another LOGS_ENGINE. + """ end - config :realtime, RealtimeWeb.Endpoint, - server: true, - url: [host: "#{app_name}.fly.dev", port: 80], - http: [ - compress: true, - port: String.to_integer(System.get_env("PORT") || "4000"), - protocol_options: [ - max_header_value_length: String.to_integer(System.get_env("MAX_HEADER_LENGTH") || "4096") - ], - transport_options: [ - # max_connection is per connection supervisor - # num_conns_sups defaults to num_acceptors - # total conns accepted here is max_connections * num_acceptors - # ref: https://ninenines.eu/docs/en/ranch/2.0/manual/ranch/ - max_connections: String.to_integer(System.get_env("MAX_CONNECTIONS") || "1000"), - num_acceptors: String.to_integer(System.get_env("NUM_ACCEPTORS") || "100"), - # IMPORTANT: support IPv6 addresses - socket_opts: [:inet6] - ] - ], - check_origin: false, - secret_key_base: secret_key_base + config :logger, + sync_threshold: 6_000, + discard_threshold: 6_000, + backends: [LogflareLogger.HttpBackend] end +# Setup production and development environments if config_env() != :test do gen_rpc_socket_ip = System.get_env("GEN_RPC_SOCKET_IP", "0.0.0.0") |> to_charlist() @@ -210,8 +262,6 @@ if config_env() != :test do config :logger, level: System.get_env("LOG_LEVEL", "info") |> String.to_existing_atom() - platform = if System.get_env("AWS_EXECUTION_ENV") == "AWS_ECS_FARGATE", do: :aws, else: :fly - config :realtime, request_id_baggage_key: System.get_env("REQUEST_ID_BAGGAGE_KEY", "request-id"), jwt_claim_validators: System.get_env("JWT_CLAIM_VALIDATORS", "{}"), @@ -221,34 +271,36 @@ if config_env() != :test do metrics_jwt_secret: System.get_env("METRICS_JWT_SECRET"), db_enc_key: System.get_env("DB_ENC_KEY"), region: System.get_env("REGION"), - prom_poll_rate: System.get_env("PROM_POLL_RATE", "5000") |> String.to_integer(), - platform: platform, + prom_poll_rate: Env.get_integer("PROM_POLL_RATE", 5000), slot_name_suffix: slot_name_suffix +end - 
queue_target = System.get_env("DB_QUEUE_TARGET", "5000") |> String.to_integer() - queue_interval = System.get_env("DB_QUEUE_INTERVAL", "5000") |> String.to_integer() +# Setup Production - after_connect_query_args = - case System.get_env("DB_AFTER_CONNECT_QUERY") do - nil -> nil - query -> {Postgrex, :query!, [query, []]} - end +if config_env() == :prod do + config :libcluster, debug: false, topologies: cluster_topologies + secret_key_base = System.fetch_env!("SECRET_KEY_BASE") + if app_name == "", do: raise("APP_NAME not available") - config :realtime, Realtime.Repo, - hostname: default_db_host, - username: username, - password: password, - database: database, - port: port, - pool_size: System.get_env("DB_POOL_SIZE", "5") |> String.to_integer(), - queue_target: queue_target, - queue_interval: queue_interval, - parameters: [ - application_name: "supabase_mt_realtime" + config :realtime, RealtimeWeb.Endpoint, + server: true, + url: [host: "#{app_name}.supabase.co", port: 443], + http: [ + compress: true, + port: Env.get_integer("PORT", 4000), + protocol_options: [ + max_header_value_length: Env.get_integer("MAX_HEADER_LENGTH", 4096) + ], + transport_options: [ + max_connections: Env.get_integer("MAX_CONNECTIONS", 1000), + num_acceptors: Env.get_integer("NUM_ACCEPTORS", 100), + socket_opts: [:inet6] + ] ], - after_connect: after_connect_query_args, - socket_options: socket_options, - ssl: ssl_opts + check_origin: false, + secret_key_base: secret_key_base + + alias Realtime.Repo.Replica replica_repos = %{ Realtime.Repo.Replica.FRA => System.get_env("DB_HOST_REPLICA_FRA", default_db_host), @@ -281,87 +333,3 @@ if config_env() != :test do ssl: ssl_opts end end - -default_cluster_strategy = - config_env() - |> case do - :prod -> "DNS" - _ -> "EPMD" - end - -cluster_topologies = - System.get_env("CLUSTER_STRATEGIES", default_cluster_strategy) - |> String.upcase() - |> String.split(",") - |> Enum.reduce([], fn strategy, acc -> - strategy - |> String.trim() - |> case do - "DNS" -> - [ - fly6pn: [ - strategy: Cluster.Strategy.DNSPoll, - config: [ - polling_interval: 5_000, - query: System.get_env("DNS_NODES"), - node_basename: app_name - ] - ] - ] ++ acc - - "POSTGRES" -> - [ - postgres: [ - strategy: LibclusterPostgres.Strategy, - config: [ - hostname: default_db_host, - username: username, - password: password, - database: database, - port: port, - parameters: [ - application_name: "cluster_node_#{node()}" - ], - socket_options: socket_options, - ssl: ssl_opts, - heartbeat_interval: 5_000 - ] - ] - ] ++ acc - - "EPMD" -> - [ - dev: [ - strategy: Cluster.Strategy.Epmd, - config: [ - hosts: [:"orange@127.0.0.1", :"pink@127.0.0.1"] - ], - connect: {:net_kernel, :connect_node, []}, - disconnect: {:net_kernel, :disconnect_node, []} - ] - ] ++ acc - - _ -> - acc - end - end) - -if config_env() == :prod do - config :libcluster, - debug: false, - topologies: cluster_topologies -end - -if System.get_env("LOGS_ENGINE") == "logflare" do - if !System.get_env("LOGFLARE_API_KEY") or !System.get_env("LOGFLARE_SOURCE_ID") do - raise """ - Environment variable LOGFLARE_API_KEY or LOGFLARE_SOURCE_ID is missing. - Check those variables or choose another LOGS_ENGINE. 
- """ - end - - config :logger, - sync_threshold: 6_000, - discard_threshold: 6_000, - backends: [LogflareLogger.HttpBackend] -end diff --git a/lib/realtime/application.ex b/lib/realtime/application.ex index ff7dea923..0f4c9ae50 100644 --- a/lib/realtime/application.ex +++ b/lib/realtime/application.ex @@ -127,19 +127,18 @@ defmodule Realtime.Application do end defp janitor_tasks do - if Application.fetch_env!(:realtime, :run_janitor) do - janitor_max_children = - Application.get_env(:realtime, :janitor_max_children) - - janitor_children_timeout = - Application.get_env(:realtime, :janitor_children_timeout) + if Application.get_env(:realtime, :run_janitor) do + janitor_max_children = Application.get_env(:realtime, :janitor_max_children) + janitor_children_timeout = Application.get_env(:realtime, :janitor_children_timeout) [ - {Task.Supervisor, - name: Realtime.Tenants.Janitor.TaskSupervisor, - max_children: janitor_max_children, - max_seconds: janitor_children_timeout, - max_restarts: 1}, + { + Task.Supervisor, + name: Realtime.Tenants.Janitor.TaskSupervisor, + max_children: janitor_max_children, + max_seconds: janitor_children_timeout, + max_restarts: 1 + }, Realtime.Tenants.Janitor, Realtime.MetricsCleaner ] diff --git a/lib/realtime/nodes.ex b/lib/realtime/nodes.ex index a2a11370c..ae237eb5f 100644 --- a/lib/realtime/nodes.ex +++ b/lib/realtime/nodes.ex @@ -27,11 +27,6 @@ defmodule Realtime.Nodes do def platform_region_translator(nil), do: nil def platform_region_translator(tenant_region) when is_binary(tenant_region) do - platform = Application.get_env(:realtime, :platform) - region_mapping(platform, tenant_region) - end - - defp region_mapping(:aws, tenant_region) do case tenant_region do "ap-east-1" -> "ap-southeast-1" "ap-northeast-1" -> "ap-southeast-1" @@ -55,28 +50,6 @@ defmodule Realtime.Nodes do end end - defp region_mapping(:fly, tenant_region) do - case tenant_region do - "us-east-1" -> "iad" - "us-west-1" -> "sea" - "sa-east-1" -> "iad" - "ca-central-1" -> "iad" - "ap-southeast-1" -> "syd" - "ap-northeast-1" -> "syd" - "ap-northeast-2" -> "syd" - "ap-southeast-2" -> "syd" - "ap-east-1" -> "syd" - "ap-south-1" -> "syd" - "eu-west-1" -> "lhr" - "eu-west-2" -> "lhr" - "eu-west-3" -> "lhr" - "eu-central-1" -> "lhr" - _ -> nil - end - end - - defp region_mapping(_, tenant_region), do: tenant_region - @doc """ Lists the nodes in a region. Sorts by node name in case the list order is unstable. 
diff --git a/mix.exs b/mix.exs index a9f47990d..13ffe985a 100644 --- a/mix.exs +++ b/mix.exs @@ -4,7 +4,7 @@ defmodule Realtime.MixProject do def project do [ app: :realtime, - version: "2.46.1", + version: "2.46.3", elixir: "~> 1.17.3", elixirc_paths: elixirc_paths(Mix.env()), start_permanent: Mix.env() == :prod, diff --git a/run.sh b/run.sh index 2dddbc1b8..66585dc2b 100755 --- a/run.sh +++ b/run.sh @@ -90,7 +90,7 @@ if [ "${ENABLE_ERL_CRASH_DUMP:-false}" = true ]; then trap upload_crash_dump_to_s3 INT TERM KILL EXIT fi -if [[ -n "${GENERATE_CLUSTER_CERTS}" ]] ; then +if [[ -n "${GENERATE_CLUSTER_CERTS:-}" ]] ; then generate_certs fi diff --git a/test/realtime/monitoring/distributed_metrics_test.exs b/test/realtime/monitoring/distributed_metrics_test.exs index a1cf89777..491083973 100644 --- a/test/realtime/monitoring/distributed_metrics_test.exs +++ b/test/realtime/monitoring/distributed_metrics_test.exs @@ -32,28 +32,5 @@ defmodule Realtime.DistributedMetricsTest do } } = DistributedMetrics.info() end - - test "metric matches on both sides", %{node: node} do - # We need to generate some data first - Realtime.Rpc.call(node, String, :to_integer, ["25"], key: 1) - Realtime.Rpc.call(node, String, :to_integer, ["25"], key: 2) - - local_metrics = DistributedMetrics.info()[node][:inet_stats] - # Use gen_rpc to not use erl dist and change the result - remote_metrics = :gen_rpc.call(node, DistributedMetrics, :info, [])[node()][:inet_stats] - - # It's not going to 100% the same because erl dist sends pings and other things out of our control - - assert local_metrics[:connections] == remote_metrics[:connections] - - assert_in_delta(local_metrics[:send_avg], remote_metrics[:recv_avg], 5) - assert_in_delta(local_metrics[:recv_avg], remote_metrics[:send_avg], 5) - - assert_in_delta(local_metrics[:send_oct], remote_metrics[:recv_oct], 5) - assert_in_delta(local_metrics[:recv_oct], remote_metrics[:send_oct], 5) - - assert_in_delta(local_metrics[:send_max], remote_metrics[:recv_max], 5) - assert_in_delta(local_metrics[:recv_max], remote_metrics[:send_max], 5) - end end end diff --git a/test/realtime/rate_counter/rate_counter_test.exs b/test/realtime/rate_counter/rate_counter_test.exs index c4a01bd74..6d3f57401 100644 --- a/test/realtime/rate_counter/rate_counter_test.exs +++ b/test/realtime/rate_counter/rate_counter_test.exs @@ -227,11 +227,12 @@ defmodule Realtime.RateCounterTest do log = capture_log(fn -> - GenCounter.add(args.id, 50) + GenCounter.add(args.id, 100) Process.sleep(100) end) - assert {:ok, %RateCounter{sum: 50, limit: %{triggered: true}}} = RateCounter.get(args) + assert {:ok, %RateCounter{sum: sum, limit: %{triggered: true}}} = RateCounter.get(args) + assert sum > 49 assert log =~ "project=tenant123 external_id=tenant123 [error] ErrorMessage: Reason" # Only one log message should be emitted From aabf9df233a0e3e1e0f0895e8c4f59257aeb51e6 Mon Sep 17 00:00:00 2001 From: "Al @h0lybyte" <5599058+h0lybyte@users.noreply.github.com> Date: Wed, 3 Sep 2025 10:49:12 -0400 Subject: [PATCH 2/8] =?UTF-8?q?=F0=9F=94=84=20Sync=20with=20upstream=20cha?= =?UTF-8?q?nges=20(#4)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: runtime setup error (#1520) * fix: use primary instead of replica on rename_settings_field (#1521) --------- Co-authored-by: Filipe Cabaço Co-authored-by: Eduardo Gurgel Co-authored-by: Bradley Haljendi <5642609+Fudster@users.noreply.github.com> --- lib/realtime/api.ex | 9 +++------ mix.exs | 2 +- test/realtime/api_test.exs | 4 ---- 3 
files changed, 4 insertions(+), 11 deletions(-) diff --git a/lib/realtime/api.ex b/lib/realtime/api.ex index 23e28feab..c504d0187 100644 --- a/lib/realtime/api.ex +++ b/lib/realtime/api.ex @@ -186,12 +186,9 @@ defmodule Realtime.Api do |> repo.preload(:extensions) end - def list_extensions(type \\ "postgres_cdc_rls") do - from(e in Extensions, - where: e.type == ^type, - select: e - ) - |> Replica.replica().all() + defp list_extensions(type \\ "postgres_cdc_rls") do + from(e in Extensions, where: e.type == ^type, select: e) + |> Repo.all() end def rename_settings_field(from, to) do diff --git a/mix.exs b/mix.exs index 13ffe985a..c0d4e1516 100644 --- a/mix.exs +++ b/mix.exs @@ -4,7 +4,7 @@ defmodule Realtime.MixProject do def project do [ app: :realtime, - version: "2.46.3", + version: "2.46.4", elixir: "~> 1.17.3", elixirc_paths: elixirc_paths(Mix.env()), start_permanent: Mix.env() == :prod, diff --git a/test/realtime/api_test.exs b/test/realtime/api_test.exs index 1c4a816b0..55dc609eb 100644 --- a/test/realtime/api_test.exs +++ b/test/realtime/api_test.exs @@ -236,10 +236,6 @@ defmodule Realtime.ApiTest do end end - test "list_extensions/1 ", %{tenants: tenants} do - assert length(Api.list_extensions()) == length(tenants) - end - describe "preload_counters/1" do test "preloads counters for a given tenant ", %{tenants: [tenant | _]} do tenant = Repo.reload!(tenant) From af2bc04c2bd752e88c4905d9df4be9468e73cf9c Mon Sep 17 00:00:00 2001 From: "Al @h0lybyte" <5599058+h0lybyte@users.noreply.github.com> Date: Thu, 4 Sep 2025 16:58:33 -0400 Subject: [PATCH 3/8] =?UTF-8?q?=F0=9F=94=84=20Sync=20with=20upstream=20cha?= =?UTF-8?q?nges=20(#6)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: runtime setup error (#1520) * fix: use primary instead of replica on rename_settings_field (#1521) --------- Co-authored-by: Filipe Cabaço Co-authored-by: Eduardo Gurgel From 5e3f5edde1a0c7eea19e7103611c32bf5c560826 Mon Sep 17 00:00:00 2001 From: "Al @h0lybyte" <5599058+h0lybyte@users.noreply.github.com> Date: Tue, 23 Sep 2025 13:22:34 -0400 Subject: [PATCH 4/8] =?UTF-8?q?=F0=9F=94=84=20Sync=20with=20upstream=20cha?= =?UTF-8?q?nges=20(#7)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: runtime setup error (#1520) * fix: use primary instead of replica on rename_settings_field (#1521) * feat: upgrade cowboy & ranch (#1523) * fix: Fix GenRpc to not try to connect to nodes that are not alive (#1525) * fix: enable presence on track message (#1527) currently the user would need to have enabled from the beginning of the channel. 
this will enable users to enable presence later in the flow by sending a track message which will enable presence messages for them * fix: set cowboy active_n=100 as cowboy 2.12.0 (#1530) cowboy 2.13.0 set the default active_n=1 * fix: provide error_code metadata on RealtimeChannel.Logging (#1531) * feat: disable UTF8 validation on websocket frames (#1532) Currently all text frames as handled only with JSON which already requires UTF-8 * fix: move DB setup to happen after Connect.init (#1533) This change reduces the impact of slow DB setup impacting other tenants trying to connect at the same time that landed on the same partition * fix: handle wal bloat (#1528) Verify that replication connection is able to reconnect when faced with WAL bloat issues * feat: replay realtime.messages (#1526) A new index was created on inserted_at DESC, topic WHERE private IS TRUE AND extension = "broadast" The hardcoded limit is 25 for now. * feat: gen_rpc pub sub adapter (#1529) Add a PubSub adapter that uses gen_rpc to send messages to other nodes. It uses :gen_rpc.abcast/3 instead of :erlang.send/2 The adapter works very similarly to the PG2 adapter. It consists of multiple workers that forward to the local node using PubSub.local_broadcast. The way to choose the worker to be used is based on the sending process just like PG2 adapter does The number of workers is controlled by `:pool_size` or `:broadcast_pool_size`. This distinction exists because Phoenix.PubSub uses `:pool_size` to define how many partitions the PubSub registry will use. It's possible to control them separately by using `:broadcast_pool_size` * fix: ensure message id doesn't raise on non-map payloads (#1534) * fix: match error on Connect (#1536) --------- Co-authored-by: Eduardo Gurgel Pinho * feat: websocket max heap size configuration (#1538) * fix: set max process heap size to 500MB instead of 8GB * feat: set websocket transport max heap size WEBSOCKET_MAX_HEAP_SIZE can be used to configure it * fix: update gen_rpc to fix gen_rpc_dispatcher issues (#1537) Issues: * Single gen_rpc_dispatcher that can be a bottleneck if the connecting takes some time * Many calls can land on the dispatcher but the node might be gone already. If we don't validate the node it might keep trying to connect until it times out instead of quickly giving up due to not being an actively connected node. 
* fix: improve ErlSysMon logging for processes (#1540) Include initial_call, ancestors, registered_name, message_queue_len and total_heap_size Also bump long_schedule and long_gc * fix: make pubsub adapter configurable (#1539) --------- Co-authored-by: Filipe Cabaço Co-authored-by: Eduardo Gurgel Co-authored-by: Bradley Haljendi <5642609+Fudster@users.noreply.github.com> --- README.md | 6 +- config/runtime.exs | 8 +- config/test.exs | 2 +- lib/realtime/api.ex | 7 +- lib/realtime/api/message.ex | 4 +- lib/realtime/application.ex | 12 +- lib/realtime/gen_rpc.ex | 33 +++ lib/realtime/gen_rpc/pub_sub.ex | 78 ++++++ lib/realtime/messages.ex | 55 +++++ lib/realtime/monitoring/erl_sys_mon.ex | 34 ++- .../monitoring/prom_ex/plugins/phoenix.ex | 13 +- lib/realtime/syn_handler.ex | 6 +- lib/realtime/tenants/batch_broadcast.ex | 38 +-- lib/realtime/tenants/connect.ex | 130 ++++++---- .../tenants/connect/check_connection.ex | 4 +- .../tenants/connect/start_counters.ex | 60 ----- lib/realtime/tenants/migrations.ex | 6 +- .../tenants/replication_connection.ex | 10 +- ...0905041441_create_messages_replay_index.ex | 11 + .../channels/payloads/broadcast.ex | 2 + .../channels/payloads/broadcast/replay.ex | 17 ++ lib/realtime_web/channels/realtime_channel.ex | 53 +++- .../channels/realtime_channel/logging.ex | 10 +- .../realtime_channel/message_dispatcher.ex | 44 +++- .../realtime_channel/presence_handler.ex | 16 +- lib/realtime_web/channels/user_socket.ex | 10 + lib/realtime_web/endpoint.ex | 9 + lib/realtime_web/tenant_broadcaster.ex | 26 +- mix.exs | 4 +- mix.lock | 8 +- rel/vm.args.eex | 6 +- test/integration/rt_channel_test.exs | 186 +++++++++++++- test/realtime/gen_rpc_pub_sub_test.exs | 2 + test/realtime/gen_rpc_test.exs | 45 ++++ test/realtime/messages_test.exs | 233 ++++++++++++++++-- test/realtime/monitoring/erl_sys_mon_test.exs | 27 +- .../prom_ex/plugins/phoenix_test.exs | 17 +- test/realtime/syn_handler_test.exs | 16 +- test/realtime/tenants/connect_test.exs | 136 +++++++--- .../tenants/janitor/maintenance_task_test.exs | 11 +- test/realtime/tenants/janitor_test.exs | 14 +- .../tenants/replication_connection_test.exs | 73 +++++- .../channels/payloads/join_test.exs | 17 +- .../realtime_channel/logging_test.exs | 27 +- .../message_dispatcher_test.exs | 90 ++++++- .../presence_handler_test.exs | 82 +++++- .../channels/realtime_channel_test.exs | 210 ++++++++++++++++ test/realtime_web/tenant_broadcaster_test.exs | 140 ++++++----- test/support/containers.ex | 8 +- 49 files changed, 1670 insertions(+), 386 deletions(-) create mode 100644 lib/realtime/gen_rpc/pub_sub.ex delete mode 100644 lib/realtime/tenants/connect/start_counters.ex create mode 100644 lib/realtime/tenants/repo/migrations/20250905041441_create_messages_replay_index.ex create mode 100644 lib/realtime_web/channels/payloads/broadcast/replay.ex create mode 100644 test/realtime/gen_rpc_pub_sub_test.exs diff --git a/README.md b/README.md index 2235bf388..4e13e44df 100644 --- a/README.md +++ b/README.md @@ -94,7 +94,7 @@ You can add your own by making a `POST` request to the server. 
You must change b "region": "us-west-1", "poll_interval_ms": 100, "poll_max_record_bytes": 1048576, - "ssl_enforced": false + "ssl_enforced": false } } ] @@ -169,6 +169,7 @@ If you're using the default tenant, the URL is `ws://realtime-dev.localhost:4000 | CONNECT_PARTITION_SLOTS | number | Number of dynamic supervisor partitions used by the Connect, ReplicationConnect processes | | METRICS_CLEANER_SCHEDULE_TIMER_IN_MS | number | Time in ms to run the Metric Cleaner task | | METRICS_RPC_TIMEOUT_IN_MS | number | Time in ms to wait for RPC call to fetch Metric per node | +| WEBSOCKET_MAX_HEAP_SIZE | number | Max number of bytes to be allocated as heap for the WebSocket transport process. If the limit is reached the process is brutally killed. Defaults to 50MB. | | REQUEST_ID_BAGGAGE_KEY | string | OTEL Baggage key to be used as request id | | OTEL_SDK_DISABLED | boolean | Disable OpenTelemetry tracing completely when 'true' | | OTEL_TRACES_EXPORTER | string | Possible values: `otlp` or `none`. See [https://github.com/open-telemetry/opentelemetry-erlang/tree/v1.4.0/apps#os-environment] for more details on how to configure the traces exporter. | @@ -190,6 +191,8 @@ If you're using the default tenant, the URL is `ws://realtime-dev.localhost:4000 | MAX_GEN_RPC_CLIENTS | number | Max amount of `gen_rpc` TCP connections per node-to-node channel | | REBALANCE_CHECK_INTERVAL_IN_MS | number | Time in ms to check if process is in the right region | | DISCONNECT_SOCKET_ON_NO_CHANNELS_INTERVAL_IN_MS | number | Time in ms to check if a socket has no channels open and if so, disconnect it | +| BROADCAST_POOL_SIZE | number | Number of processes to relay Phoenix.PubSub messages across the cluster | + The OpenTelemetry variables mentioned above are not an exhaustive list of all [supported environment variables](https://opentelemetry.io/docs/languages/sdk-configuration/). 
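`WEBSOCKET_MAX_HEAP_SIZE` is specified in bytes, but the BEAM expresses `max_heap_size` in words, hence the `div/2` by the VM word size in the `config/runtime.exs` hunk below. A sketch of the conversion and of one plausible enforcement point; the `Process.flag/2` call is an assumption, since the actual wiring lives in the `user_socket.ex` and `endpoint.ex` changes listed in the diffstat but not shown here:

```elixir
# 50 MB default, converted from bytes to VM words (wordsize is 8 on 64-bit).
bytes = String.to_integer(System.get_env("WEBSOCKET_MAX_HEAP_SIZE") || "50000000")
words = div(bytes, :erlang.system_info(:wordsize))  #=> 6_250_000 on 64-bit

# Assumed enforcement point: the websocket transport process sets its own
# limit so the VM brutally kills just that process, instead of the whole
# node OOMing, if a single socket's heap grows past the configured size.
Process.flag(:max_heap_size, %{size: words, kill: true, error_logger: true})
```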
@@ -284,6 +287,7 @@ This is the list of operational codes that can help you understand your deployme | UnknownErrorOnController | An error we are not handling correctly was triggered on a controller | | UnknownErrorOnChannel | An error we are not handling correctly was triggered on a channel | | PresenceRateLimitReached | Limit of presence events reached | +| UnableToReplayMessages | An error while replaying messages | ## License diff --git a/config/runtime.exs b/config/runtime.exs index ac0a2569b..47961f98a 100644 --- a/config/runtime.exs +++ b/config/runtime.exs @@ -67,6 +67,9 @@ janitor_run_after_in_ms = Env.get_integer("JANITOR_RUN_AFTER_IN_MS", :timer.minu janitor_children_timeout = Env.get_integer("JANITOR_CHILDREN_TIMEOUT", :timer.seconds(5)) janitor_schedule_timer = Env.get_integer("JANITOR_SCHEDULE_TIMER_IN_MS", :timer.hours(4)) platform = if System.get_env("AWS_EXECUTION_ENV") == "AWS_ECS_FARGATE", do: :aws, else: :fly +broadcast_pool_size = Env.get_integer("BROADCAST_POOL_SIZE", 10) +pubsub_adapter = System.get_env("PUBSUB_ADAPTER", "pg2") |> String.to_atom() +websocket_max_heap_size = div(Env.get_integer("WEBSOCKET_MAX_HEAP_SIZE", 50_000_000), :erlang.system_info(:wordsize)) no_channel_timeout_in_ms = if config_env() == :test, @@ -106,6 +109,7 @@ config :realtime, Realtime.Repo, ssl: ssl_opts config :realtime, + websocket_max_heap_size: websocket_max_heap_size, migration_partition_slots: migration_partition_slots, connect_partition_slots: connect_partition_slots, rebalance_check_interval_in_ms: rebalance_check_interval_in_ms, @@ -120,7 +124,9 @@ config :realtime, rpc_timeout: rpc_timeout, max_gen_rpc_clients: max_gen_rpc_clients, no_channel_timeout_in_ms: no_channel_timeout_in_ms, - platform: platform + platform: platform, + pubsub_adapter: pubsub_adapter, + broadcast_pool_size: broadcast_pool_size if config_env() != :test && run_janitor? 
do config :realtime, diff --git a/config/test.exs b/config/test.exs index 4c7c66ae8..a69c51701 100644 --- a/config/test.exs +++ b/config/test.exs @@ -47,7 +47,7 @@ config :logger, # Configures Elixir's Logger config :logger, :console, format: "$time $metadata[$level] $message\n", - metadata: [:request_id, :project, :external_id, :application_name, :sub, :iss, :exp] + metadata: [:error_code, :request_id, :project, :external_id, :application_name, :sub, :iss, :exp] config :opentelemetry, span_processor: :simple, diff --git a/lib/realtime/api.ex b/lib/realtime/api.ex index c504d0187..f612a5c1e 100644 --- a/lib/realtime/api.ex +++ b/lib/realtime/api.ex @@ -186,9 +186,10 @@ defmodule Realtime.Api do |> repo.preload(:extensions) end - defp list_extensions(type \\ "postgres_cdc_rls") do - from(e in Extensions, where: e.type == ^type, select: e) - |> Repo.all() + defp list_extensions(type) do + query = from(e in Extensions, where: e.type == ^type, select: e) + + Repo.all(query) end def rename_settings_field(from, to) do diff --git a/lib/realtime/api/message.ex b/lib/realtime/api/message.ex index 90ebc5bc9..18bbc9a87 100644 --- a/lib/realtime/api/message.ex +++ b/lib/realtime/api/message.ex @@ -8,6 +8,8 @@ defmodule Realtime.Api.Message do @primary_key {:id, Ecto.UUID, autogenerate: true} @schema_prefix "realtime" + @type t :: %__MODULE__{} + schema "messages" do field(:topic, :string) field(:extension, Ecto.Enum, values: [:broadcast, :presence]) @@ -39,7 +41,7 @@ defmodule Realtime.Api.Message do end defp maybe_put_timestamp(changeset, field) do - case Map.get(changeset.data, field) do + case get_field(changeset, field) do nil -> put_timestamp(changeset, field) _ -> changeset end diff --git a/lib/realtime/application.ex b/lib/realtime/application.ex index 0f4c9ae50..99096edfb 100644 --- a/lib/realtime/application.ex +++ b/lib/realtime/application.ex @@ -52,6 +52,7 @@ defmodule Realtime.Application do region = Application.get_env(:realtime, :region) :syn.join(RegionNodes, region, self(), node: node()) + broadcast_pool_size = Application.get_env(:realtime, :broadcast_pool_size, 10) migration_partition_slots = Application.get_env(:realtime, :migration_partition_slots) connect_partition_slots = Application.get_env(:realtime, :connect_partition_slots) no_channel_timeout_in_ms = Application.get_env(:realtime, :no_channel_timeout_in_ms) @@ -65,7 +66,8 @@ defmodule Realtime.Application do Realtime.Repo, RealtimeWeb.Telemetry, {Cluster.Supervisor, [topologies, [name: Realtime.ClusterSupervisor]]}, - {Phoenix.PubSub, name: Realtime.PubSub, pool_size: 10}, + {Phoenix.PubSub, + name: Realtime.PubSub, pool_size: 10, adapter: pubsub_adapter(), broadcast_pool_size: broadcast_pool_size}, {Cachex, name: Realtime.RateCounter}, Realtime.Tenants.Cache, Realtime.RateCounter.DynamicSupervisor, @@ -152,4 +154,12 @@ defmodule Realtime.Application do OpentelemetryPhoenix.setup(adapter: :cowboy2) OpentelemetryEcto.setup([:realtime, :repo], db_statement: :enabled) end + + defp pubsub_adapter do + if Application.fetch_env!(:realtime, :pubsub_adapter) == :gen_rpc do + Realtime.GenRpcPubSub + else + Phoenix.PubSub.PG2 + end + end end diff --git a/lib/realtime/gen_rpc.ex b/lib/realtime/gen_rpc.ex index bb7099242..a7b46a869 100644 --- a/lib/realtime/gen_rpc.ex +++ b/lib/realtime/gen_rpc.ex @@ -10,6 +10,22 @@ defmodule Realtime.GenRpc do @type result :: any | {:error, :rpc_error, reason :: any} + @doc """ + Broadcasts the message `msg` asynchronously to the registered process `name` on the specified `nodes`. 
+ + Options: + + - `:key` - Optional key to consistently select the same gen_rpc clients to guarantee message order between nodes + """ + @spec abcast([node], atom, any, keyword()) :: :ok + def abcast(nodes, name, msg, opts) when is_list(nodes) and is_atom(name) and is_list(opts) do + key = Keyword.get(opts, :key, nil) + nodes = rpc_nodes(nodes, key) + + :gen_rpc.abcast(nodes, name, msg) + :ok + end + @doc """ Fire and forget apply(mod, func, args) on all nodes @@ -41,6 +57,23 @@ defmodule Realtime.GenRpc do @spec call(node, module, atom, list(any), keyword()) :: result def call(node, mod, func, args, opts) when is_atom(node) and is_atom(mod) and is_atom(func) and is_list(args) and is_list(opts) do + if node == node() or node in Node.list() do + do_call(node, mod, func, args, opts) + else + tenant_id = Keyword.get(opts, :tenant_id) + + log_error( + "ErrorOnRpcCall", + %{target: node, mod: mod, func: func, error: :badnode}, + project: tenant_id, + external_id: tenant_id + ) + + {:error, :rpc_error, :badnode} + end + end + + defp do_call(node, mod, func, args, opts) do timeout = Keyword.get(opts, :timeout, default_rpc_timeout()) tenant_id = Keyword.get(opts, :tenant_id) key = Keyword.get(opts, :key, nil) diff --git a/lib/realtime/gen_rpc/pub_sub.ex b/lib/realtime/gen_rpc/pub_sub.ex new file mode 100644 index 000000000..b2a90b165 --- /dev/null +++ b/lib/realtime/gen_rpc/pub_sub.ex @@ -0,0 +1,78 @@ +defmodule Realtime.GenRpcPubSub do + @moduledoc """ + gen_rpc Phoenix.PubSub adapter + """ + + @behaviour Phoenix.PubSub.Adapter + alias Realtime.GenRpc + use Supervisor + + @impl true + def node_name(_), do: node() + + # Supervisor callbacks + + def start_link(opts) do + adapter_name = Keyword.fetch!(opts, :adapter_name) + name = Keyword.fetch!(opts, :name) + pool_size = Keyword.get(opts, :pool_size, 1) + broadcast_pool_size = Keyword.get(opts, :broadcast_pool_size, pool_size) + + Supervisor.start_link(__MODULE__, {adapter_name, name, broadcast_pool_size}, + name: :"#{name}#{adapter_name}_supervisor" + ) + end + + @impl true + def init({adapter_name, pubsub, pool_size}) do + workers = for number <- 1..pool_size, do: :"#{pubsub}#{adapter_name}_#{number}" + + :persistent_term.put(adapter_name, List.to_tuple(workers)) + + children = + for worker <- workers do + Supervisor.child_spec({Realtime.GenRpcPubSub.Worker, {pubsub, worker}}, id: worker) + end + + Supervisor.init(children, strategy: :one_for_one) + end + + defp worker_name(adapter_name, key) do + workers = :persistent_term.get(adapter_name) + elem(workers, :erlang.phash2(key, tuple_size(workers))) + end + + @impl true + def broadcast(adapter_name, topic, message, dispatcher) do + worker = worker_name(adapter_name, self()) + GenRpc.abcast(Node.list(), worker, forward_to_local(topic, message, dispatcher), key: worker) + end + + @impl true + def direct_broadcast(adapter_name, node_name, topic, message, dispatcher) do + worker = worker_name(adapter_name, self()) + GenRpc.abcast([node_name], worker, forward_to_local(topic, message, dispatcher), key: worker) + end + + defp forward_to_local(topic, message, dispatcher), do: {:ftl, topic, message, dispatcher} +end + +defmodule Realtime.GenRpcPubSub.Worker do + @moduledoc false + use GenServer + + @doc false + def start_link({pubsub, worker}), do: GenServer.start_link(__MODULE__, pubsub, name: worker) + + @impl true + def init(pubsub), do: {:ok, pubsub} + + @impl true + def handle_info({:ftl, topic, message, dispatcher}, pubsub) do + Phoenix.PubSub.local_broadcast(pubsub, topic, message, dispatcher) + 
{:noreply, pubsub} + end + + @impl true + def handle_info(_, pubsub), do: {:noreply, pubsub} +end diff --git a/lib/realtime/messages.ex b/lib/realtime/messages.ex index c6d571db7..804a48d66 100644 --- a/lib/realtime/messages.ex +++ b/lib/realtime/messages.ex @@ -3,6 +3,61 @@ defmodule Realtime.Messages do Handles `realtime.messages` table operations """ + alias Realtime.Api.Message + + import Ecto.Query, only: [from: 2] + + @hard_limit 25 + @default_timeout 5_000 + + @doc """ + Fetch last `limit ` messages for a given `topic` inserted after `since` + + Automatically uses RPC if the database connection is not in the same node + + Only allowed for private channels + """ + @spec replay(pid, String.t(), non_neg_integer, non_neg_integer) :: + {:ok, Message.t(), [String.t()]} | {:error, term} | {:error, :rpc_error, term} + def replay(conn, topic, since, limit) when node(conn) == node() and is_integer(since) and is_integer(limit) do + limit = max(min(limit, @hard_limit), 1) + + with {:ok, since} <- DateTime.from_unix(since, :millisecond), + {:ok, messages} <- messages(conn, topic, since, limit) do + {:ok, Enum.reverse(messages), MapSet.new(messages, & &1.id)} + else + {:error, :postgrex_exception} -> {:error, :failed_to_replay_messages} + {:error, :invalid_unix_time} -> {:error, :invalid_replay_params} + error -> error + end + end + + def replay(conn, topic, since, limit) when is_integer(since) and is_integer(limit) do + Realtime.GenRpc.call(node(conn), __MODULE__, :replay, [conn, topic, since, limit], key: topic) + end + + def replay(_, _, _, _), do: {:error, :invalid_replay_params} + + defp messages(conn, topic, since, limit) do + since = DateTime.to_naive(since) + # We want to avoid searching partitions in the future as they should be empty + # so we limit to 1 minute in the future to account for any potential drift + now = NaiveDateTime.utc_now() |> NaiveDateTime.add(1, :minute) + + query = + from m in Message, + where: + m.topic == ^topic and + m.private == true and + m.extension == :broadcast and + m.inserted_at >= ^since and + m.inserted_at < ^now, + limit: ^limit, + order_by: [desc: m.inserted_at] + + Realtime.Repo.all(conn, query, Message, timeout: @default_timeout) + end + @doc """ Deletes messages older than 72 hours for a given tenant connection """ diff --git a/lib/realtime/monitoring/erl_sys_mon.ex b/lib/realtime/monitoring/erl_sys_mon.ex index 32a4f857b..3278886d6 100644 --- a/lib/realtime/monitoring/erl_sys_mon.ex +++ b/lib/realtime/monitoring/erl_sys_mon.ex @@ -10,8 +10,8 @@ defmodule Realtime.ErlSysMon do @defaults [ :busy_dist_port, :busy_port, - {:long_gc, 250}, - {:long_schedule, 100}, + {:long_gc, 500}, + {:long_schedule, 500}, {:long_message_queue, {0, 1_000}} ] @@ -24,8 +24,36 @@ defmodule Realtime.ErlSysMon do {:ok, []} end + def handle_info({:monitor, pid, _type, _meta} = msg, state) when is_pid(pid) do + log_process_info(msg, pid) + {:noreply, state} + end + def handle_info(msg, state) do - Logger.error("#{__MODULE__} message: " <> inspect(msg)) + Logger.warning("#{__MODULE__} message: " <> inspect(msg)) {:noreply, state} end + + defp log_process_info(msg, pid) do + pid_info = + pid + |> Process.info(:dictionary) + |> case do + {:dictionary, dict} when is_list(dict) -> + {List.keyfind(dict, :"$initial_call", 0), List.keyfind(dict, :"$ancestors", 0)} + + other -> + other + end + + extra_info = Process.info(pid, [:registered_name, :message_queue_len, :total_heap_size]) + + Logger.warning( + "#{__MODULE__} message: " <> + inspect(msg) <> "|\n process info: 
#{inspect(pid_info)} #{inspect(extra_info)}" + ) + rescue + _ -> + Logger.warning("#{__MODULE__} message: " <> inspect(msg)) + end end diff --git a/lib/realtime/monitoring/prom_ex/plugins/phoenix.ex b/lib/realtime/monitoring/prom_ex/plugins/phoenix.ex index d3f64afbe..6cc3709d2 100644 --- a/lib/realtime/monitoring/prom_ex/plugins/phoenix.ex +++ b/lib/realtime/monitoring/prom_ex/plugins/phoenix.ex @@ -57,15 +57,10 @@ if Code.ensure_loaded?(Phoenix) do def execute_metrics do active_conn = - case :ets.lookup(:ranch_server, {:listener_sup, HTTP}) do - [] -> - -1 - - _ -> - HTTP - |> :ranch_server.get_connections_sup() - |> :supervisor.count_children() - |> Keyword.get(:active) + if :ranch.info()[HTTP] do + :ranch.info(HTTP)[:all_connections] + else + -1 end :telemetry.execute(@event_all_connections, %{active: active_conn}, %{}) diff --git a/lib/realtime/syn_handler.ex b/lib/realtime/syn_handler.ex index 397c8cf8f..d2fa5541c 100644 --- a/lib/realtime/syn_handler.ex +++ b/lib/realtime/syn_handler.ex @@ -10,9 +10,9 @@ defmodule Realtime.SynHandler do @behaviour :syn_event_handler @impl true - def on_registry_process_updated(Connect, tenant_id, _pid, %{conn: conn}, :normal) when is_pid(conn) do + def on_registry_process_updated(Connect, tenant_id, pid, %{conn: conn}, :normal) when is_pid(conn) do # Update that a database connection is ready - Endpoint.local_broadcast(Connect.syn_topic(tenant_id), "ready", %{conn: conn}) + Endpoint.local_broadcast(Connect.syn_topic(tenant_id), "ready", %{pid: pid, conn: conn}) end def on_registry_process_updated(PostgresCdcRls, tenant_id, _pid, meta, _reason) do @@ -38,7 +38,7 @@ defmodule Realtime.SynHandler do end topic = topic(mod) - Endpoint.local_broadcast(topic <> ":" <> name, topic <> "_down", nil) + Endpoint.local_broadcast(topic <> ":" <> name, topic <> "_down", %{pid: pid, reason: reason}) :ok end diff --git a/lib/realtime/tenants/batch_broadcast.ex b/lib/realtime/tenants/batch_broadcast.ex index 4fc31aa0f..98427621b 100644 --- a/lib/realtime/tenants/batch_broadcast.ex +++ b/lib/realtime/tenants/batch_broadcast.ex @@ -29,7 +29,9 @@ defmodule Realtime.Tenants.BatchBroadcast do @spec broadcast( auth_params :: map() | nil, tenant :: Tenant.t(), - messages :: %{messages: list(%{topic: String.t(), payload: map(), event: String.t(), private: boolean()})}, + messages :: %{ + messages: list(%{id: String.t(), topic: String.t(), payload: map(), event: String.t(), private: boolean()}) + }, super_user :: boolean() ) :: :ok | {:error, atom()} def broadcast(auth_params, tenant, messages, super_user \\ false) @@ -59,8 +61,8 @@ defmodule Realtime.Tenants.BatchBroadcast do # Handle events for public channel events |> Map.get(false, []) - |> Enum.each(fn %{topic: sub_topic, payload: payload, event: event} -> - send_message_and_count(tenant, events_per_second_rate, sub_topic, event, payload, true) + |> Enum.each(fn message -> + send_message_and_count(tenant, events_per_second_rate, message, true) end) # Handle events for private channel @@ -69,14 +71,14 @@ defmodule Realtime.Tenants.BatchBroadcast do |> Enum.group_by(fn event -> Map.get(event, :topic) end) |> Enum.each(fn {topic, events} -> if super_user do - Enum.each(events, fn %{topic: sub_topic, payload: payload, event: event} -> - send_message_and_count(tenant, events_per_second_rate, sub_topic, event, payload, false) + Enum.each(events, fn message -> + send_message_and_count(tenant, events_per_second_rate, message, false) end) else case permissions_for_message(tenant, auth_params, topic) do %Policies{broadcast: 
%BroadcastPolicies{write: true}} -> - Enum.each(events, fn %{topic: sub_topic, payload: payload, event: event} -> - send_message_and_count(tenant, events_per_second_rate, sub_topic, event, payload, false) + Enum.each(events, fn message -> + send_message_and_count(tenant, events_per_second_rate, message, false) end) _ -> @@ -91,15 +93,15 @@ defmodule Realtime.Tenants.BatchBroadcast do def broadcast(_, nil, _, _), do: {:error, :tenant_not_found} - def changeset(payload, attrs) do + defp changeset(payload, attrs) do payload |> cast(attrs, []) |> cast_embed(:messages, required: true, with: &message_changeset/2) end - def message_changeset(message, attrs) do + defp message_changeset(message, attrs) do message - |> cast(attrs, [:topic, :payload, :event, :private]) + |> cast(attrs, [:id, :topic, :payload, :event, :private]) |> maybe_put_private_change() |> validate_required([:topic, :payload, :event]) end @@ -112,11 +114,19 @@ defmodule Realtime.Tenants.BatchBroadcast do end @event_type "broadcast" - defp send_message_and_count(tenant, events_per_second_rate, topic, event, payload, public?) do - tenant_topic = Tenants.tenant_topic(tenant, topic, public?) - payload = %{"payload" => payload, "event" => event, "type" => "broadcast"} + defp send_message_and_count(tenant, events_per_second_rate, message, public?) do + tenant_topic = Tenants.tenant_topic(tenant, message.topic, public?) - broadcast = %Phoenix.Socket.Broadcast{topic: topic, event: @event_type, payload: payload} + payload = %{"payload" => message.payload, "event" => message.event, "type" => "broadcast"} + + payload = + if message[:id] do + Map.put(payload, "meta", %{"id" => message.id}) + else + payload + end + + broadcast = %Phoenix.Socket.Broadcast{topic: message.topic, event: @event_type, payload: payload} GenCounter.add(events_per_second_rate.id) TenantBroadcaster.pubsub_broadcast(tenant.external_id, tenant_topic, broadcast, RealtimeChannel.MessageDispatcher) diff --git a/lib/realtime/tenants/connect.ex b/lib/realtime/tenants/connect.ex index b9bf00eb4..3d8f39833 100644 --- a/lib/realtime/tenants/connect.ex +++ b/lib/realtime/tenants/connect.ex @@ -19,7 +19,6 @@ defmodule Realtime.Tenants.Connect do alias Realtime.Tenants.Connect.GetTenant alias Realtime.Tenants.Connect.Piper alias Realtime.Tenants.Connect.RegisterProcess - alias Realtime.Tenants.Connect.StartCounters alias Realtime.Tenants.Migrations alias Realtime.Tenants.ReplicationConnection alias Realtime.UsersCounter @@ -56,6 +55,7 @@ defmodule Realtime.Tenants.Connect do | {:error, :tenant_database_unavailable} | {:error, :initializing} | {:error, :tenant_database_connection_initializing} + | {:error, :tenant_db_too_many_connections} | {:error, :rpc_error, term()} def lookup_or_start_connection(tenant_id, opts \\ []) when is_binary(tenant_id) do case get_status(tenant_id) do @@ -63,13 +63,16 @@ defmodule Realtime.Tenants.Connect do {:ok, conn} {:error, :tenant_database_unavailable} -> - call_external_node(tenant_id, opts) + {:error, :tenant_database_unavailable} {:error, :tenant_database_connection_initializing} -> call_external_node(tenant_id, opts) {:error, :initializing} -> {:error, :tenant_database_unavailable} + + {:error, :tenant_db_too_many_connections} -> + {:error, :tenant_db_too_many_connections} end end @@ -81,16 +84,16 @@ defmodule Realtime.Tenants.Connect do | {:error, :tenant_database_unavailable} | {:error, :initializing} | {:error, :tenant_database_connection_initializing} + | {:error, :tenant_db_too_many_connections} def get_status(tenant_id) do case 
:syn.lookup(__MODULE__, tenant_id) do - {_pid, %{conn: nil}} -> - wait_for_connection(tenant_id) + {pid, %{conn: nil}} -> + wait_for_connection(pid, tenant_id) {_, %{conn: conn}} -> {:ok, conn} :undefined -> - Logger.warning("Connection process starting up") {:error, :tenant_database_connection_initializing} error -> @@ -101,7 +104,7 @@ defmodule Realtime.Tenants.Connect do def syn_topic(tenant_id), do: "connect:#{tenant_id}" - defp wait_for_connection(tenant_id) do + defp wait_for_connection(pid, tenant_id) do RealtimeWeb.Endpoint.subscribe(syn_topic(tenant_id)) # We do a lookup after subscribing because we could've missed a message while subscribing @@ -112,9 +115,18 @@ defmodule Realtime.Tenants.Connect do _ -> # Wait for up to 5 seconds for the ready event receive do - %{event: "ready", payload: %{conn: conn}} -> {:ok, conn} + %{event: "ready", payload: %{pid: ^pid, conn: conn}} -> + {:ok, conn} + + %{event: "connect_down", payload: %{pid: ^pid, reason: {:shutdown, :tenant_db_too_many_connections}}} -> + {:error, :tenant_db_too_many_connections} + + %{event: "connect_down", payload: %{pid: ^pid, reason: _reason}} -> + metadata = [external_id: tenant_id, project: tenant_id] + log_error("UnableToConnectToTenantDatabase", "Unable to connect to tenant database", metadata) + {:error, :tenant_database_unavailable} after - 5_000 -> {:error, :initializing} + 15_000 -> {:error, :initializing} end end after @@ -139,16 +151,6 @@ defmodule Realtime.Tenants.Connect do {:error, {:already_started, _}} -> get_status(tenant_id) - {:error, {:shutdown, :tenant_db_too_many_connections}} -> - {:error, :tenant_db_too_many_connections} - - {:error, {:shutdown, :tenant_not_found}} -> - {:error, :tenant_not_found} - - {:error, :shutdown} -> - log_error("UnableToConnectToTenantDatabase", "Unable to connect to tenant database", metadata) - {:error, :tenant_database_unavailable} - {:error, error} -> log_error("UnableToConnectToTenantDatabase", error, metadata) {:error, :tenant_database_unavailable} @@ -209,30 +211,33 @@ defmodule Realtime.Tenants.Connect do def init(%{tenant_id: tenant_id} = state) do Logger.metadata(external_id: tenant_id, project: tenant_id) + {:ok, state, {:continue, :db_connect}} + end + + @impl true + def handle_continue(:db_connect, state) do pipes = [ GetTenant, CheckConnection, - StartCounters, RegisterProcess ] case Piper.run(pipes, state) do {:ok, acc} -> - {:ok, acc, {:continue, :run_migrations}} + {:noreply, acc, {:continue, :run_migrations}} {:error, :tenant_not_found} -> - {:stop, {:shutdown, :tenant_not_found}} + {:stop, {:shutdown, :tenant_not_found}, state} {:error, :tenant_db_too_many_connections} -> - {:stop, {:shutdown, :tenant_db_too_many_connections}} + {:stop, {:shutdown, :tenant_db_too_many_connections}, state} {:error, error} -> log_error("UnableToConnectToTenantDatabase", error) - {:stop, :shutdown} + {:stop, :shutdown, state} end end - @impl true def handle_continue(:run_migrations, state) do %{tenant: tenant, db_conn_pid: db_conn_pid} = state Logger.warning("Tenant #{tenant.external_id} is initializing: #{inspect(node())}") @@ -252,31 +257,10 @@ defmodule Realtime.Tenants.Connect do end def handle_continue(:start_replication, state) do - %{tenant: tenant} = state - - with {:ok, replication_connection_pid} <- ReplicationConnection.start(tenant, self()) do - replication_connection_reference = Process.monitor(replication_connection_pid) - - state = %{ - state - | replication_connection_pid: replication_connection_pid, - replication_connection_reference: 
replication_connection_reference - } - - {:noreply, state, {:continue, :setup_connected_user_events}} - else - {:error, :max_wal_senders_reached} -> - log_error("ReplicationMaxWalSendersReached", "Tenant database has reached the maximum number of WAL senders") - {:stop, :shutdown, state} - - {:error, error} -> - log_error("StartReplicationFailed", error) - {:stop, :shutdown, state} + case start_replication_connection(state) do + {:ok, state} -> {:noreply, state, {:continue, :setup_connected_user_events}} + {:error, state} -> {:stop, :shutdown, state} end - rescue - error -> - log_error("StartReplicationFailed", error) - {:stop, :shutdown, state} end def handle_continue(:setup_connected_user_events, state) do @@ -348,13 +332,30 @@ defmodule Realtime.Tenants.Connect do {:stop, :shutdown, state} end + @replication_recovery_backoff 1000 + # Handle replication connection termination def handle_info( {:DOWN, replication_connection_reference, _, _, _}, %{replication_connection_reference: replication_connection_reference} = state ) do - Logger.warning("Replication connection has died") - {:stop, :shutdown, state} + log_warning("ReplicationConnectionDown", "Replication connection has been terminated") + Process.send_after(self(), :recover_replication_connection, @replication_recovery_backoff) + state = %{state | replication_connection_pid: nil, replication_connection_reference: nil} + {:noreply, state} + end + + @replication_connection_query "SELECT 1 from pg_stat_activity where application_name='realtime_replication_connection'" + def handle_info(:recover_replication_connection, state) do + with %{num_rows: 0} <- Postgrex.query!(state.db_conn_pid, @replication_connection_query, []), + {:ok, state} <- start_replication_connection(state) do + {:noreply, state} + else + _ -> + log_error("ReplicationConnectionRecoveryFailed", "Replication connection recovery failed") + Process.send_after(self(), :recover_replication_connection, @replication_recovery_backoff) + {:noreply, state} + end end def handle_info(_, state), do: {:noreply, state} @@ -375,6 +376,7 @@ defmodule Realtime.Tenants.Connect do ## Private functions defp call_external_node(tenant_id, opts) do + Logger.warning("Connection process starting up") rpc_timeout = Keyword.get(opts, :rpc_timeout, @rpc_timeout_default) with tenant <- Tenants.Cache.get_tenant_by_external_id(tenant_id), @@ -413,4 +415,32 @@ defmodule Realtime.Tenants.Connect do defp tenant_suspended?(_), do: :ok defp rebalance_check_interval_in_ms(), do: Application.fetch_env!(:realtime, :rebalance_check_interval_in_ms) + + defp start_replication_connection(state) do + %{tenant: tenant} = state + + with {:ok, replication_connection_pid} <- ReplicationConnection.start(tenant, self()) do + replication_connection_reference = Process.monitor(replication_connection_pid) + + state = %{ + state + | replication_connection_pid: replication_connection_pid, + replication_connection_reference: replication_connection_reference + } + + {:ok, state} + else + {:error, :max_wal_senders_reached} -> + log_error("ReplicationMaxWalSendersReached", "Tenant database has reached the maximum number of WAL senders") + {:error, state} + + {:error, error} -> + log_error("StartReplicationFailed", error) + {:error, state} + end + rescue + error -> + log_error("StartReplicationFailed", error) + {:error, state} + end end diff --git a/lib/realtime/tenants/connect/check_connection.ex b/lib/realtime/tenants/connect/check_connection.ex index 697c08b6c..53cd8e480 100644 --- 
a/lib/realtime/tenants/connect/check_connection.ex +++ b/lib/realtime/tenants/connect/check_connection.ex @@ -2,16 +2,14 @@ defmodule Realtime.Tenants.Connect.CheckConnection do @moduledoc """ Check tenant database connection. """ - alias Realtime.Database @behaviour Realtime.Tenants.Connect.Piper @impl true def run(acc) do %{tenant: tenant} = acc - case Database.check_tenant_connection(tenant) do + case Realtime.Database.check_tenant_connection(tenant) do {:ok, conn} -> - Process.link(conn) db_conn_reference = Process.monitor(conn) {:ok, %{acc | db_conn_pid: conn, db_conn_reference: db_conn_reference}} diff --git a/lib/realtime/tenants/connect/start_counters.ex b/lib/realtime/tenants/connect/start_counters.ex deleted file mode 100644 index f8ce6c378..000000000 --- a/lib/realtime/tenants/connect/start_counters.ex +++ /dev/null @@ -1,60 +0,0 @@ -defmodule Realtime.Tenants.Connect.StartCounters do - @moduledoc """ - Start tenant counters. - """ - - alias Realtime.RateCounter - alias Realtime.Tenants - - @behaviour Realtime.Tenants.Connect.Piper - - @impl true - def run(acc) do - %{tenant: tenant} = acc - - with :ok <- start_joins_per_second_counter(tenant), - :ok <- start_max_events_counter(tenant), - :ok <- start_db_events_counter(tenant) do - {:ok, acc} - end - end - - def start_joins_per_second_counter(tenant) do - res = - tenant - |> Tenants.joins_per_second_rate() - |> RateCounter.new() - - case res do - {:ok, _} -> :ok - {:error, {:already_started, _}} -> :ok - {:error, reason} -> {:error, reason} - end - end - - def start_max_events_counter(tenant) do - res = - tenant - |> Tenants.events_per_second_rate() - |> RateCounter.new() - - case res do - {:ok, _} -> :ok - {:error, {:already_started, _}} -> :ok - {:error, reason} -> {:error, reason} - end - end - - def start_db_events_counter(tenant) do - res = - tenant - |> Tenants.db_events_per_second_rate() - |> RateCounter.new() - - case res do - {:ok, _} -> :ok - {:error, {:already_started, _}} -> :ok - {:error, reason} -> {:error, reason} - end - end -end diff --git a/lib/realtime/tenants/migrations.ex b/lib/realtime/tenants/migrations.ex index 04475c2b7..a5fa1eb8b 100644 --- a/lib/realtime/tenants/migrations.ex +++ b/lib/realtime/tenants/migrations.ex @@ -74,7 +74,8 @@ defmodule Realtime.Tenants.Migrations do RealtimeSendSetsTopicConfig, SubscriptionIndexBridgingDisabled, RunSubscriptionIndexBridgingDisabled, - BroadcastSendErrorLogging + BroadcastSendErrorLogging, + CreateMessagesReplayIndex } @migrations [ @@ -140,7 +141,8 @@ defmodule Realtime.Tenants.Migrations do {20_250_128_220_012, RealtimeSendSetsTopicConfig}, {20_250_506_224_012, SubscriptionIndexBridgingDisabled}, {20_250_523_164_012, RunSubscriptionIndexBridgingDisabled}, - {20_250_714_121_412, BroadcastSendErrorLogging} + {20_250_714_121_412, BroadcastSendErrorLogging}, + {20_250_905_041_441, CreateMessagesReplayIndex} ] defstruct [:tenant_external_id, :settings] diff --git a/lib/realtime/tenants/replication_connection.ex b/lib/realtime/tenants/replication_connection.ex index 45e03c66e..4ebb1f8e8 100644 --- a/lib/realtime/tenants/replication_connection.ex +++ b/lib/realtime/tenants/replication_connection.ex @@ -144,8 +144,8 @@ defmodule Realtime.Tenants.ReplicationConnection do port: connection_opts.port, socket_options: connection_opts.socket_options, ssl: connection_opts.ssl, - backoff_type: :stop, sync_connect: true, + auto_reconnect: false, parameters: [application_name: "realtime_replication_connection"] ] @@ -310,7 +310,13 @@ defmodule 
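CheckConnection now only monitors the tenant connection instead of also linking to it: a link propagates exits in both directions, while a monitor delivers a :DOWN message the owner can react to, which is what makes the recovery flow above possible. In miniature:

    pid = spawn(fn -> Process.sleep(50); exit(:boom) end)
    ref = Process.monitor(pid)

    receive do
      # The watcher survives and decides what to do with the reason
      {:DOWN, ^ref, :process, ^pid, reason} -> reason
    end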
Realtime.Tenants.ReplicationConnection do {:ok, topic} <- get_or_error(to_broadcast, "topic", :topic_missing), {:ok, private} <- get_or_error(to_broadcast, "private", :private_missing), %Tenant{} = tenant <- Cache.get_tenant_by_external_id(tenant_id), - broadcast_message = %{topic: topic, event: event, private: private, payload: Map.put_new(payload, "id", id)}, + broadcast_message = %{ + id: id, + topic: topic, + event: event, + private: private, + payload: Map.put_new(payload, "id", id) + }, :ok <- BatchBroadcast.broadcast(nil, tenant, %{messages: [broadcast_message]}, true) do inserted_at = NaiveDateTime.from_iso8601!(inserted_at) latency_inserted_at = NaiveDateTime.utc_now() |> NaiveDateTime.diff(inserted_at) diff --git a/lib/realtime/tenants/repo/migrations/20250905041441_create_messages_replay_index.ex b/lib/realtime/tenants/repo/migrations/20250905041441_create_messages_replay_index.ex new file mode 100644 index 000000000..77afde6e0 --- /dev/null +++ b/lib/realtime/tenants/repo/migrations/20250905041441_create_messages_replay_index.ex @@ -0,0 +1,11 @@ +defmodule Realtime.Tenants.Migrations.CreateMessagesReplayIndex do + @moduledoc false + + use Ecto.Migration + + def change do + create_if_not_exists index(:messages, [{:desc, :inserted_at}, :topic], + where: "extension = 'broadcast' and private IS TRUE" + ) + end +end diff --git a/lib/realtime_web/channels/payloads/broadcast.ex b/lib/realtime_web/channels/payloads/broadcast.ex index 7feddb043..e2881fd54 100644 --- a/lib/realtime_web/channels/payloads/broadcast.ex +++ b/lib/realtime_web/channels/payloads/broadcast.ex @@ -9,9 +9,11 @@ defmodule RealtimeWeb.Channels.Payloads.Broadcast do embedded_schema do field :ack, :boolean, default: false field :self, :boolean, default: false + embeds_one :replay, RealtimeWeb.Channels.Payloads.Broadcast.Replay end def changeset(broadcast, attrs) do cast(broadcast, attrs, [:ack, :self], message: &Join.error_message/2) + |> cast_embed(:replay, invalid_message: "unable to parse, expected a map") end end diff --git a/lib/realtime_web/channels/payloads/broadcast/replay.ex b/lib/realtime_web/channels/payloads/broadcast/replay.ex new file mode 100644 index 000000000..b0a5804a2 --- /dev/null +++ b/lib/realtime_web/channels/payloads/broadcast/replay.ex @@ -0,0 +1,17 @@ +defmodule RealtimeWeb.Channels.Payloads.Broadcast.Replay do + @moduledoc """ + Validate broadcast replay field of the join payload. + """ + use Ecto.Schema + import Ecto.Changeset + alias RealtimeWeb.Channels.Payloads.Join + + embedded_schema do + field :limit, :integer, default: 10 + field :since, :integer, default: 0 + end + + def changeset(broadcast, attrs) do + cast(broadcast, attrs, [:limit, :since], message: &Join.error_message/2) + end +end diff --git a/lib/realtime_web/channels/realtime_channel.ex b/lib/realtime_web/channels/realtime_channel.ex index 26c033f5c..1d58d9da7 100644 --- a/lib/realtime_web/channels/realtime_channel.ex +++ b/lib/realtime_web/channels/realtime_channel.ex @@ -72,12 +72,21 @@ defmodule RealtimeWeb.RealtimeChannel do {:ok, claims, confirm_token_ref} <- confirm_token(socket), socket = assign_authorization_context(socket, sub_topic, claims), {:ok, db_conn} <- Connect.lookup_or_start_connection(tenant_id), - {:ok, socket} <- maybe_assign_policies(sub_topic, db_conn, socket) do + {:ok, socket} <- maybe_assign_policies(sub_topic, db_conn, socket), + {:ok, replayed_message_ids} <- + maybe_replay_messages(params["config"], sub_topic, db_conn, socket.assigns.private?) 
do tenant_topic = Tenants.tenant_topic(tenant_id, sub_topic, !socket.assigns.private?) # fastlane subscription metadata = - MessageDispatcher.fastlane_metadata(transport_pid, serializer, topic, socket.assigns.log_level, tenant_id) + MessageDispatcher.fastlane_metadata( + transport_pid, + serializer, + topic, + log_level, + tenant_id, + replayed_message_ids + ) RealtimeWeb.Endpoint.subscribe(tenant_topic, metadata: metadata) @@ -198,6 +207,12 @@ defmodule RealtimeWeb.RealtimeChannel do {:error, :shutdown_in_progress} -> log_error(socket, "RealtimeRestarting", "Realtime is restarting, please standby") + {:error, :failed_to_replay_messages} -> + log_error(socket, "UnableToReplayMessages", "Realtime was unable to replay messages") + + {:error, :invalid_replay_params} -> + log_error(socket, "UnableToReplayMessages", "Replay params are not valid") + {:error, error} -> log_error(socket, "UnknownErrorOnChannel", error) {:error, %{reason: "Unknown Error on Channel"}} @@ -205,6 +220,17 @@ defmodule RealtimeWeb.RealtimeChannel do end @impl true + def handle_info({:replay, messages}, socket) do + for message <- messages do + meta = %{"replayed" => true, "id" => message.id} + payload = %{"payload" => message.payload, "event" => message.event, "type" => "broadcast", "meta" => meta} + + push(socket, "broadcast", payload) + end + + {:noreply, socket} + end + def handle_info(:update_rate_counter, socket) do count(socket) @@ -376,7 +402,7 @@ defmodule RealtimeWeb.RealtimeChannel do end def handle_in("presence", payload, %{assigns: %{private?: false}} = socket) do - with {:ok, socket} <- PresenceHandler.handle(payload, socket) do + with {:ok, socket} <- PresenceHandler.handle(payload, nil, socket) do {:reply, :ok, socket} else {:error, :rate_limit_exceeded} -> @@ -762,4 +788,25 @@ defmodule RealtimeWeb.RealtimeChannel do do: {:error, :private_only}, else: :ok end + + defp maybe_replay_messages(%{"broadcast" => %{"replay" => _}}, _sub_topic, _db_conn, false = _private?) do + {:error, :invalid_replay_params} + end + + defp maybe_replay_messages(%{"broadcast" => %{"replay" => replay_params}}, sub_topic, db_conn, true = _private?) 
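Replayed messages are sent to self() because a channel cannot push to the socket until join/3 has returned; the handle_info({:replay, messages}, socket) clause above then flushes them once the channel process is live. The idiom reduced to its core, assuming a plain Phoenix channel:

    defmodule SketchChannel do
      use Phoenix.Channel

      def join("room:" <> _id, _params, socket) do
        # push/3 is not allowed yet; queue the work in our own mailbox
        send(self(), :after_join)
        {:ok, socket}
      end

      def handle_info(:after_join, socket) do
        # The join reply has been sent, so pushing is now safe
        push(socket, "replay", %{"replayed" => true})
        {:noreply, socket}
      end
    end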
+ when is_map(replay_params) do + with {:ok, messages, message_ids} <- + Realtime.Messages.replay( + db_conn, + sub_topic, + replay_params["since"], + replay_params["limit"] || 25 + ) do + # Send to self because we can't write to the socket before finishing the join process + send(self(), {:replay, messages}) + {:ok, message_ids} + end + end + + defp maybe_replay_messages(_, _, _, _), do: {:ok, MapSet.new()} end diff --git a/lib/realtime_web/channels/realtime_channel/logging.ex b/lib/realtime_web/channels/realtime_channel/logging.ex index 296dce1bc..2f6c91fdb 100644 --- a/lib/realtime_web/channels/realtime_channel/logging.ex +++ b/lib/realtime_web/channels/realtime_channel/logging.ex @@ -21,7 +21,7 @@ defmodule RealtimeWeb.RealtimeChannel.Logging do def log_error(socket, code, msg) do msg = build_msg(code, msg) emit_system_error(:error, code) - log(socket, :error, msg) + log(socket, :error, code, msg) {:error, %{reason: msg}} end @@ -32,7 +32,7 @@ defmodule RealtimeWeb.RealtimeChannel.Logging do {:error, %{reason: binary}} def log_warning(socket, code, msg) do msg = build_msg(code, msg) - log(socket, :warning, msg) + log(socket, :warning, code, msg) {:error, %{reason: msg}} end @@ -59,16 +59,16 @@ defmodule RealtimeWeb.RealtimeChannel.Logging do if code, do: "#{code}: #{msg}", else: msg end - defp log(%{assigns: %{tenant: tenant, access_token: access_token}}, level, msg) do + defp log(%{assigns: %{tenant: tenant, access_token: access_token}}, level, code, msg) do Logger.metadata(external_id: tenant, project: tenant) if level in [:error, :warning], do: update_metadata_with_token_claims(access_token) - Logger.log(level, msg) + Logger.log(level, msg, error_code: code) end defp maybe_log(%{assigns: %{log_level: log_level}} = socket, level, code, msg) do msg = build_msg(code, msg) emit_system_error(level, code) - if Logger.compare_levels(log_level, level) != :gt, do: log(socket, level, msg) + if Logger.compare_levels(log_level, level) != :gt, do: log(socket, level, code, msg) if level in [:error, :warning], do: {:error, %{reason: msg}}, else: :ok end diff --git a/lib/realtime_web/channels/realtime_channel/message_dispatcher.ex b/lib/realtime_web/channels/realtime_channel/message_dispatcher.ex index b5db97f95..32e1528f3 100644 --- a/lib/realtime_web/channels/realtime_channel/message_dispatcher.ex +++ b/lib/realtime_web/channels/realtime_channel/message_dispatcher.ex @@ -5,12 +5,14 @@ defmodule RealtimeWeb.RealtimeChannel.MessageDispatcher do require Logger - def fastlane_metadata(fastlane_pid, serializer, topic, :info, tenant_id) do - {:realtime_channel_fastlane, fastlane_pid, serializer, topic, {:log, tenant_id}} + def fastlane_metadata(fastlane_pid, serializer, topic, log_level, tenant_id, replayed_message_ids \\ MapSet.new()) + + def fastlane_metadata(fastlane_pid, serializer, topic, :info, tenant_id, replayed_message_ids) do + {:rc_fastlane, fastlane_pid, serializer, topic, {:log, tenant_id}, replayed_message_ids} end - def fastlane_metadata(fastlane_pid, serializer, topic, _log_level, _tenant_id) do - {:realtime_channel_fastlane, fastlane_pid, serializer, topic} + def fastlane_metadata(fastlane_pid, serializer, topic, _log_level, _tenant_id, replayed_message_ids) do + {:rc_fastlane, fastlane_pid, serializer, topic, replayed_message_ids} end @doc """ @@ -23,22 +25,34 @@ defmodule RealtimeWeb.RealtimeChannel.MessageDispatcher do # This reduce caches the serialization and bypasses the channel process going straight to the # transport process + message_id = message_id(msg.payload) + # Credo 
doesn't like that we don't use the result aggregation _ = Enum.reduce(subscribers, %{}, fn {pid, _}, cache when pid == from -> cache - {pid, {:realtime_channel_fastlane, fastlane_pid, serializer, join_topic}}, cache -> - send(pid, :update_rate_counter) - do_dispatch(msg, fastlane_pid, serializer, join_topic, cache) + {pid, {:rc_fastlane, fastlane_pid, serializer, join_topic, replayed_message_ids}}, cache -> + if already_replayed?(message_id, replayed_message_ids) do + # skip already replayed message + cache + else + send(pid, :update_rate_counter) + do_dispatch(msg, fastlane_pid, serializer, join_topic, cache) + end - {pid, {:realtime_channel_fastlane, fastlane_pid, serializer, join_topic, {:log, tenant_id}}}, cache -> - send(pid, :update_rate_counter) - log = "Received message on #{join_topic} with payload: #{inspect(msg, pretty: true)}" - Logger.info(log, external_id: tenant_id, project: tenant_id) + {pid, {:rc_fastlane, fastlane_pid, serializer, join_topic, {:log, tenant_id}, replayed_message_ids}}, cache -> + if already_replayed?(message_id, replayed_message_ids) do + # skip already replayed message + cache + else + send(pid, :update_rate_counter) + log = "Received message on #{join_topic} with payload: #{inspect(msg, pretty: true)}" + Logger.info(log, external_id: tenant_id, project: tenant_id) - do_dispatch(msg, fastlane_pid, serializer, join_topic, cache) + do_dispatch(msg, fastlane_pid, serializer, join_topic, cache) + end {pid, _}, cache -> send(pid, msg) @@ -48,6 +62,12 @@ defmodule RealtimeWeb.RealtimeChannel.MessageDispatcher do :ok end + defp message_id(%{"meta" => %{"id" => id}}), do: id + defp message_id(_), do: nil + + defp already_replayed?(nil, _replayed_message_ids), do: false + defp already_replayed?(message_id, replayed_message_ids), do: MapSet.member?(replayed_message_ids, message_id) + defp do_dispatch(msg, fastlane_pid, serializer, join_topic, cache) do case cache do %{^serializer => encoded_msg} -> diff --git a/lib/realtime_web/channels/realtime_channel/presence_handler.ex b/lib/realtime_web/channels/realtime_channel/presence_handler.ex index 00ce77c02..9dc23d219 100644 --- a/lib/realtime_web/channels/realtime_channel/presence_handler.ex +++ b/lib/realtime_web/channels/realtime_channel/presence_handler.ex @@ -52,28 +52,22 @@ defmodule RealtimeWeb.RealtimeChannel.PresenceHandler do end end - @spec handle(map(), Socket.t()) :: - {:ok, Socket.t()} | {:error, :rls_policy_error | :unable_to_set_policies | :rate_limit_exceeded} - def handle(_, %{assigns: %{presence_enabled?: false}} = socket), do: {:ok, socket} - def handle(payload, socket) when not is_private?(socket), do: handle(payload, nil, socket) - @spec handle(map(), pid() | nil, Socket.t()) :: {:ok, Socket.t()} | {:error, :rls_policy_error | :unable_to_set_policies | :rate_limit_exceeded | :unable_to_track_presence} - def handle(_, _, %{assigns: %{presence_enabled?: false}} = socket), do: {:ok, socket} - def handle(%{"event" => event} = payload, db_conn, socket) do event = String.downcase(event, :ascii) handle_presence_event(event, payload, db_conn, socket) end - def handle(_payload, _db_conn, socket), do: {:ok, socket} + def handle(_, _, socket), do: {:ok, socket} - defp handle_presence_event("track", payload, _db_conn, socket) when not is_private?(socket) do + defp handle_presence_event("track", payload, _, socket) when not is_private?(socket) do track(socket, payload) end - defp handle_presence_event("track", payload, db_conn, socket) when is_nil(socket.assigns.policies.presence.write) do + defp 
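Two details of the dispatcher are easy to miss: each subscriber carries its own MapSet of replayed message ids so duplicates are skipped per socket, and the Enum.reduce threads a cache keyed by serializer so a frame is encoded once no matter how many subscribers share that serializer. The caching idea on its own, with function captures standing in for serializers:

    serializers = [&Jason.encode!/1, &:erlang.term_to_binary/1, &Jason.encode!/1]
    msg = %{hello: "world"}

    {cache, frames} =
      Enum.reduce(serializers, {%{}, []}, fn serializer, {cache, acc} ->
        case cache do
          # Already encoded for this serializer: reuse the cached frame
          %{^serializer => frame} ->
            {cache, [frame | acc]}

          _ ->
            frame = serializer.(msg)
            {Map.put(cache, serializer, frame), [frame | acc]}
        end
      end)

    # Three subscribers, only two distinct serializers, so only two encodes
    2 = map_size(cache)
    3 = length(frames)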
handle_presence_event("track", payload, db_conn, socket) + when is_private?(socket) and is_nil(socket.assigns.policies.presence.write) do %{assigns: %{authorization_context: authorization_context, policies: policies}} = socket case Authorization.get_write_authorizations(policies, db_conn, authorization_context) do @@ -111,6 +105,8 @@ defmodule RealtimeWeb.RealtimeChannel.PresenceHandler do end defp track(socket, payload) do + socket = assign(socket, :presence_enabled?, true) + %{assigns: %{presence_key: presence_key, tenant_topic: tenant_topic}} = socket payload = Map.get(payload, "payload", %{}) diff --git a/lib/realtime_web/channels/user_socket.ex b/lib/realtime_web/channels/user_socket.ex index 09dd15906..849aa052d 100644 --- a/lib/realtime_web/channels/user_socket.ex +++ b/lib/realtime_web/channels/user_socket.ex @@ -1,4 +1,12 @@ defmodule RealtimeWeb.UserSocket do + # This is defined up here before `use Phoenix.Socket` is called so that we can define `Phoenix.Socket.init/1` + # It has to be overridden because we need to set the `max_heap_size` flag from the transport process context + @impl true + def init(state) when is_tuple(state) do + Process.flag(:max_heap_size, max_heap_size()) + Phoenix.Socket.__init__(state) + end + use Phoenix.Socket use Realtime.Logs @@ -122,4 +130,6 @@ defmodule RealtimeWeb.UserSocket do _ -> @default_log_level end end + + defp max_heap_size(), do: Application.fetch_env!(:realtime, :websocket_max_heap_size) end diff --git a/lib/realtime_web/endpoint.ex b/lib/realtime_web/endpoint.ex index 917ab65b9..190e1a917 100644 --- a/lib/realtime_web/endpoint.ex +++ b/lib/realtime_web/endpoint.ex @@ -16,6 +16,15 @@ defmodule RealtimeWeb.Endpoint do connect_info: [:peer_data, :uri, :x_headers], fullsweep_after: 20, max_frame_size: 8_000_000, + # https://github.com/ninenines/cowboy/blob/24d32de931a0c985ff7939077463fc8be939f0e9/doc/src/manual/cowboy_websocket.asciidoc#L228 + # active_n: The number of packets Cowboy will request from the socket at once. + # This can be used to tweak the performance of the server. Higher values reduce + # the number of times Cowboy need to request more packets from the port driver at + # the expense of potentially higher memory being used. + active_n: 100, + # Skip validating UTF8 for faster frame processing. 
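The init/1 override exists because :max_heap_size is a per-process flag and must be set from inside the transport process, which is exactly where Phoenix.Socket.init/1 runs. What the flag does, sketched with the same 500 MB budget the vm.args change below uses (the map's :size field is in words, not bytes):

    # Kill (and log) the process if its heap grows past ~500 MB
    limit_words = div(500_000_000, :erlang.system_info(:wordsize))
    Process.flag(:max_heap_size, %{size: limit_words, kill: true, error_logger: true})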
+ # Currently all text frames are handled only with JSON which already requires UTF-8
+ validate_utf8: false,
serializer: [
{Phoenix.Socket.V1.JSONSerializer, "~> 1.0.0"},
{Phoenix.Socket.V2.JSONSerializer, "~> 2.0.0"}
diff --git a/lib/realtime_web/tenant_broadcaster.ex b/lib/realtime_web/tenant_broadcaster.ex
index ee8646614..da02df79e 100644
--- a/lib/realtime_web/tenant_broadcaster.ex
+++ b/lib/realtime_web/tenant_broadcaster.ex
@@ -9,7 +9,11 @@ defmodule RealtimeWeb.TenantBroadcaster do
def pubsub_broadcast(tenant_id, topic, message, dispatcher) do
collect_payload_size(tenant_id, message)

- Realtime.GenRpc.multicast(PubSub, :local_broadcast, [Realtime.PubSub, topic, message, dispatcher], key: topic)
+ if pubsub_adapter() == :gen_rpc do
+ PubSub.broadcast(Realtime.PubSub, topic, message, dispatcher)
+ else
+ Realtime.GenRpc.multicast(PubSub, :local_broadcast, [Realtime.PubSub, topic, message, dispatcher], key: topic)
+ end

:ok
end
@@ -25,12 +29,16 @@
def pubsub_broadcast_from(tenant_id, from, topic, message, dispatcher) do
collect_payload_size(tenant_id, message)

- Realtime.GenRpc.multicast(
- PubSub,
- :local_broadcast_from,
- [Realtime.PubSub, from, topic, message, dispatcher],
- key: topic
- )
+ if pubsub_adapter() == :gen_rpc do
+ PubSub.broadcast_from(Realtime.PubSub, from, topic, message, dispatcher)
+ else
+ Realtime.GenRpc.multicast(
+ PubSub,
+ :local_broadcast_from,
+ [Realtime.PubSub, from, topic, message, dispatcher],
+ key: topic
+ )
+ end

:ok
end
@@ -45,4 +53,8 @@
defp collect_payload_size(tenant_id, payload) do
:telemetry.execute(@payload_size_event, %{size: :erlang.external_size(payload)}, %{tenant: tenant_id})
end
+
+ defp pubsub_adapter do
+ Application.fetch_env!(:realtime, :pubsub_adapter)
+ end
end
diff --git a/mix.exs b/mix.exs
index c0d4e1516..9c66b3dde 100644
--- a/mix.exs
+++ b/mix.exs
@@ -4,7 +4,7 @@ defmodule Realtime.MixProject do
def project do
[
app: :realtime,
- version: "2.46.4",
+ version: "2.51.3",
elixir: "~> 1.17.3",
elixirc_paths: elixirc_paths(Mix.env()),
start_permanent: Mix.env() == :prod,
@@ -90,7 +90,7 @@
{:opentelemetry_phoenix, "~> 2.0"},
{:opentelemetry_cowboy, "~> 1.0"},
{:opentelemetry_ecto, "~> 1.2"},
- {:gen_rpc, git: "https://github.com/supabase/gen_rpc.git", ref: "d161cf263c661a534eaabf80aac7a34484dac772"},
+ {:gen_rpc, git: "https://github.com/supabase/gen_rpc.git", ref: "901aada9adb307ff89a8be197a5d384e69dd57d6"},
{:mimic, "~> 1.0", only: :test},
{:floki, ">= 0.30.0", only: :test},
{:mint_web_socket, "~> 1.0", only: :test},
diff --git a/mix.lock b/mix.lock
index 76eb0d980..c5fce6022 100644
--- a/mix.lock
+++ b/mix.lock
@@ -7,9 +7,9 @@
"castore": {:hex, :castore, "1.0.11", "4bbd584741601eb658007339ea730b082cc61f3554cf2e8f39bf693a11b49073", [:mix], [], "hexpm", "e03990b4db988df56262852f20de0f659871c35154691427a5047f4967a16a62"},
"chatterbox": {:hex, :ts_chatterbox, "0.15.1", "5cac4d15dd7ad61fc3c4415ce4826fc563d4643dee897a558ec4ea0b1c835c9c", [:rebar3], [{:hpack, "~> 0.3.0", [hex: :hpack_erl, repo: "hexpm", optional: false]}], "hexpm", "4f75b91451338bc0da5f52f3480fa6ef6e3a2aeecfc33686d6b3d0a0948f31aa"},
"corsica": {:hex, :corsica, "2.1.3", "dccd094ffce38178acead9ae743180cdaffa388f35f0461ba1e8151d32e190e6", [:mix], [{:plug, "~> 1.0", [hex: :plug, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4.0 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm",
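TenantBroadcaster now branches on a :pubsub_adapter application env value: with the gen_rpc-backed PubSub adapter a plain broadcast already reaches all nodes, so the explicit GenRpc multicast is only needed otherwise. The shape of that switch, assuming the config key is set at boot (for example config :realtime, :pubsub_adapter, :gen_rpc):

    message = {:broadcast, "hello"}

    case Application.fetch_env!(:realtime, :pubsub_adapter) do
      :gen_rpc ->
        # The adapter itself fans the message out across the cluster
        Phoenix.PubSub.broadcast(Realtime.PubSub, "tenant:topic", message)

      _ ->
        # Fan out explicitly on every node via the project's GenRpc wrapper,
        # keyed by topic as in the hunk above
        Realtime.GenRpc.multicast(
          Phoenix.PubSub,
          :local_broadcast,
          [Realtime.PubSub, "tenant:topic", message, Phoenix.PubSub],
          key: "tenant:topic"
        )
    end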
"616c08f61a345780c2cf662ff226816f04d8868e12054e68963e95285b5be8bc"}, - "cowboy": {:hex, :cowboy, "2.12.0", "f276d521a1ff88b2b9b4c54d0e753da6c66dd7be6c9fca3d9418b561828a3731", [:make, :rebar3], [{:cowlib, "2.13.0", [hex: :cowlib, repo: "hexpm", optional: false]}, {:ranch, "1.8.0", [hex: :ranch, repo: "hexpm", optional: false]}], "hexpm", "8a7abe6d183372ceb21caa2709bec928ab2b72e18a3911aa1771639bef82651e"}, + "cowboy": {:hex, :cowboy, "2.13.0", "09d770dd5f6a22cc60c071f432cd7cb87776164527f205c5a6b0f24ff6b38990", [:make, :rebar3], [{:cowlib, ">= 2.14.0 and < 3.0.0", [hex: :cowlib, repo: "hexpm", optional: false]}, {:ranch, ">= 1.8.0 and < 3.0.0", [hex: :ranch, repo: "hexpm", optional: false]}], "hexpm", "e724d3a70995025d654c1992c7b11dbfea95205c047d86ff9bf1cda92ddc5614"}, "cowboy_telemetry": {:hex, :cowboy_telemetry, "0.4.0", "f239f68b588efa7707abce16a84d0d2acf3a0f50571f8bb7f56a15865aae820c", [:rebar3], [{:cowboy, "~> 2.7", [hex: :cowboy, repo: "hexpm", optional: false]}, {:telemetry, "~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "7d98bac1ee4565d31b62d59f8823dfd8356a169e7fcbb83831b8a5397404c9de"}, - "cowlib": {:hex, :cowlib, "2.13.0", "db8f7505d8332d98ef50a3ef34b34c1afddec7506e4ee4dd4a3a266285d282ca", [:make, :rebar3], [], "hexpm", "e1e1284dc3fc030a64b1ad0d8382ae7e99da46c3246b815318a4b848873800a4"}, + "cowlib": {:hex, :cowlib, "2.15.0", "3c97a318a933962d1c12b96ab7c1d728267d2c523c25a5b57b0f93392b6e9e25", [:make, :rebar3], [], "hexpm", "4f00c879a64b4fe7c8fcb42a4281925e9ffdb928820b03c3ad325a617e857532"}, "credo": {:hex, :credo, "1.7.11", "d3e805f7ddf6c9c854fd36f089649d7cf6ba74c42bc3795d587814e3c9847102", [:mix], [{:bunt, "~> 0.2.1 or ~> 1.0", [hex: :bunt, repo: "hexpm", optional: false]}, {:file_system, "~> 0.2 or ~> 1.0", [hex: :file_system, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "56826b4306843253a66e47ae45e98e7d284ee1f95d53d1612bb483f88a8cf219"}, "ctx": {:hex, :ctx, "0.6.0", "8ff88b70e6400c4df90142e7f130625b82086077a45364a78d208ed3ed53c7fe", [:rebar3], [], "hexpm", "a14ed2d1b67723dbebbe423b28d7615eb0bdcba6ff28f2d1f1b0a7e1d4aa5fc2"}, "db_connection": {:hex, :db_connection, "2.8.0", "64fd82cfa6d8e25ec6660cea73e92a4cbc6a18b31343910427b702838c4b33b2", [:mix], [{:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "008399dae5eee1bf5caa6e86d204dcb44242c82b1ed5e22c881f2c34da201b15"}, @@ -29,7 +29,7 @@ "file_system": {:hex, :file_system, "1.1.0", "08d232062284546c6c34426997dd7ef6ec9f8bbd090eb91780283c9016840e8f", [:mix], [], "hexpm", "bfcf81244f416871f2a2e15c1b515287faa5db9c6bcf290222206d120b3d43f6"}, "finch": {:hex, :finch, "0.19.0", "c644641491ea854fc5c1bbaef36bfc764e3f08e7185e1f084e35e0672241b76d", [:mix], [{:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:mint, "~> 1.6.2 or ~> 1.7", [hex: :mint, repo: "hexpm", optional: false]}, {:nimble_options, "~> 0.4 or ~> 1.0", [hex: :nimble_options, repo: "hexpm", optional: false]}, {:nimble_pool, "~> 1.1", [hex: :nimble_pool, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "fc5324ce209125d1e2fa0fcd2634601c52a787aff1cd33ee833664a5af4ea2b6"}, "floki": {:hex, :floki, "0.37.0", "b83e0280bbc6372f2a403b2848013650b16640cd2470aea6701f0632223d719e", [:mix], [], "hexpm", "516a0c15a69f78c47dc8e0b9b3724b29608aa6619379f91b1ffa47109b5d0dd3"}, - "gen_rpc": {:git, "https://github.com/supabase/gen_rpc.git", 
"d161cf263c661a534eaabf80aac7a34484dac772", [ref: "d161cf263c661a534eaabf80aac7a34484dac772"]}, + "gen_rpc": {:git, "https://github.com/supabase/gen_rpc.git", "901aada9adb307ff89a8be197a5d384e69dd57d6", [ref: "901aada9adb307ff89a8be197a5d384e69dd57d6"]}, "gettext": {:hex, :gettext, "0.26.2", "5978aa7b21fada6deabf1f6341ddba50bc69c999e812211903b169799208f2a8", [:mix], [{:expo, "~> 0.5.1 or ~> 1.0", [hex: :expo, repo: "hexpm", optional: false]}], "hexpm", "aa978504bcf76511efdc22d580ba08e2279caab1066b76bb9aa81c4a1e0a32a5"}, "gproc": {:hex, :gproc, "0.9.1", "f1df0364423539cf0b80e8201c8b1839e229e5f9b3ccb944c5834626998f5b8c", [:rebar3], [], "hexpm", "905088e32e72127ed9466f0bac0d8e65704ca5e73ee5a62cb073c3117916d507"}, "grpcbox": {:hex, :grpcbox, "0.17.1", "6e040ab3ef16fe699ffb513b0ef8e2e896da7b18931a1ef817143037c454bcce", [:rebar3], [{:acceptor_pool, "~> 1.0.0", [hex: :acceptor_pool, repo: "hexpm", optional: false]}, {:chatterbox, "~> 0.15.1", [hex: :ts_chatterbox, repo: "hexpm", optional: false]}, {:ctx, "~> 0.6.0", [hex: :ctx, repo: "hexpm", optional: false]}, {:gproc, "~> 0.9.1", [hex: :gproc, repo: "hexpm", optional: false]}], "hexpm", "4a3b5d7111daabc569dc9cbd9b202a3237d81c80bf97212fbc676832cb0ceb17"}, @@ -82,7 +82,7 @@ "postgres_replication": {:git, "https://github.com/filipecabaco/postgres_replication.git", "69129221f0263aa13faa5fbb8af97c28aeb4f71c", []}, "postgrex": {:hex, :postgrex, "0.20.0", "363ed03ab4757f6bc47942eff7720640795eb557e1935951c1626f0d303a3aed", [:mix], [{:db_connection, "~> 2.1", [hex: :db_connection, repo: "hexpm", optional: false]}, {:decimal, "~> 1.5 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: true]}, {:table, "~> 0.1.0", [hex: :table, repo: "hexpm", optional: true]}], "hexpm", "d36ef8b36f323d29505314f704e21a1a038e2dc387c6409ee0cd24144e187c0f"}, "prom_ex": {:hex, :prom_ex, "1.9.0", "63e6dda6c05cdeec1f26c48443dcc38ffd2118b3665ae8d2bd0e5b79f2aea03e", [:mix], [{:absinthe, ">= 1.6.0", [hex: :absinthe, repo: "hexpm", optional: true]}, {:broadway, ">= 1.0.2", [hex: :broadway, repo: "hexpm", optional: true]}, {:ecto, ">= 3.5.0", [hex: :ecto, repo: "hexpm", optional: true]}, {:finch, "~> 0.15", [hex: :finch, repo: "hexpm", optional: false]}, {:jason, "~> 1.2", [hex: :jason, repo: "hexpm", optional: false]}, {:oban, ">= 2.4.0", [hex: :oban, repo: "hexpm", optional: true]}, {:octo_fetch, "~> 0.3", [hex: :octo_fetch, repo: "hexpm", optional: false]}, {:phoenix, ">= 1.5.0", [hex: :phoenix, repo: "hexpm", optional: true]}, {:phoenix_live_view, ">= 0.14.0", [hex: :phoenix_live_view, repo: "hexpm", optional: true]}, {:plug, ">= 1.12.1", [hex: :plug, repo: "hexpm", optional: true]}, {:plug_cowboy, "~> 2.5", [hex: :plug_cowboy, repo: "hexpm", optional: false]}, {:telemetry, ">= 1.0.0", [hex: :telemetry, repo: "hexpm", optional: false]}, {:telemetry_metrics, "~> 0.6", [hex: :telemetry_metrics, repo: "hexpm", optional: false]}, {:telemetry_metrics_prometheus_core, "~> 1.0", [hex: :telemetry_metrics_prometheus_core, repo: "hexpm", optional: false]}, {:telemetry_poller, "~> 1.0", [hex: :telemetry_poller, repo: "hexpm", optional: false]}], "hexpm", "01f3d4f69ec93068219e686cc65e58a29c42bea5429a8ff4e2121f19db178ee6"}, - "ranch": {:hex, :ranch, "1.8.0", "8c7a100a139fd57f17327b6413e4167ac559fbc04ca7448e9be9057311597a1d", [:make, :rebar3], [], "hexpm", "49fbcfd3682fab1f5d109351b61257676da1a2fdbe295904176d5e521a2ddfe5"}, + "ranch": {:hex, :ranch, "2.2.0", "25528f82bc8d7c6152c57666ca99ec716510fe0925cb188172f41ce93117b1b0", 
[:make, :rebar3], [], "hexpm", "fa0b99a1780c80218a4197a59ea8d3bdae32fbff7e88527d7d8a4787eff4f8e7"}, "recon": {:hex, :recon, "2.5.6", "9052588e83bfedfd9b72e1034532aee2a5369d9d9343b61aeb7fbce761010741", [:mix, :rebar3], [], "hexpm", "96c6799792d735cc0f0fd0f86267e9d351e63339cbe03df9d162010cefc26bb0"}, "req": {:hex, :req, "0.5.10", "a3a063eab8b7510785a467f03d30a8d95f66f5c3d9495be3474b61459c54376c", [:mix], [{:brotli, "~> 0.3.1", [hex: :brotli, repo: "hexpm", optional: true]}, {:ezstd, "~> 1.0", [hex: :ezstd, repo: "hexpm", optional: true]}, {:finch, "~> 0.17", [hex: :finch, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}, {:mime, "~> 2.0.6 or ~> 2.1", [hex: :mime, repo: "hexpm", optional: false]}, {:nimble_csv, "~> 1.0", [hex: :nimble_csv, repo: "hexpm", optional: true]}, {:plug, "~> 1.0", [hex: :plug, repo: "hexpm", optional: true]}], "hexpm", "8a604815743f8a2d3b5de0659fa3137fa4b1cffd636ecb69b30b2b9b2c2559be"}, "sleeplocks": {:hex, :sleeplocks, "1.1.3", "96a86460cc33b435c7310dbd27ec82ca2c1f24ae38e34f8edde97f756503441a", [:rebar3], [], "hexpm", "d3b3958552e6eb16f463921e70ae7c767519ef8f5be46d7696cc1ed649421321"}, diff --git a/rel/vm.args.eex b/rel/vm.args.eex index 278da5524..9de4e952f 100644 --- a/rel/vm.args.eex +++ b/rel/vm.args.eex @@ -10,8 +10,8 @@ ## Tweak GC to run more often ##-env ERL_FULLSWEEP_AFTER 10 -## Limit process heap for all procs to 1000 MB -+hmax 1000000000 +## Limit process heap for all procs to 500 MB. The number here is the number of words ++hmax <%= div(500_000_000, :erlang.system_info(:wordsize)) %> ## Set distribution buffer busy limit (default is 1024) +zdbbl 100000 @@ -19,4 +19,4 @@ ## Disable Busy Wait +sbwt none +sbwtdio none -+sbwtdcpu none \ No newline at end of file ++sbwtdcpu none diff --git a/test/integration/rt_channel_test.exs b/test/integration/rt_channel_test.exs index 806a5ad7e..23b1a3a7f 100644 --- a/test/integration/rt_channel_test.exs +++ b/test/integration/rt_channel_test.exs @@ -25,6 +25,7 @@ defmodule Realtime.Integration.RtChannelTest do alias Realtime.Tenants alias Realtime.Tenants.Authorization alias Realtime.Tenants.Connect + alias Realtime.Tenants.ReplicationConnection alias RealtimeWeb.RealtimeChannel.Tracker alias RealtimeWeb.SocketDisconnect @@ -653,8 +654,8 @@ defmodule Realtime.Integration.RtChannelTest do :syn.update_registry(Connect, tenant.external_id, fn _pid, meta -> %{meta | conn: nil} end) payload = %{"event" => "TEST", "payload" => %{"msg" => 1}, "type" => "broadcast"} WebsocketClient.send_event(service_role_socket, topic, "broadcast", payload) - # Waiting more than 5 seconds as this is the amount of time we will wait for the Connection to be ready - refute_receive %Message{event: "broadcast", payload: ^payload, topic: ^topic}, 6000 + # Waiting more than 15 seconds as this is the amount of time we will wait for the Connection to be ready + refute_receive %Message{event: "broadcast", payload: ^payload, topic: ^topic}, 16000 end) assert log =~ "UnableToHandleBroadcast" @@ -831,7 +832,7 @@ defmodule Realtime.Integration.RtChannelTest do refute_receive %Message{event: "presence_diff"}, 500 # Waiting more than 5 seconds as this is the amount of time we will wait for the Connection to be ready - refute_receive %Message{event: "phx_leave", topic: ^topic}, 6000 + refute_receive %Message{event: "phx_leave", topic: ^topic}, 16000 end) assert log =~ "UnableToHandlePresence" @@ -909,6 +910,56 @@ defmodule Realtime.Integration.RtChannelTest do assert_receive %Message{event: "phx_reply", 
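The vm.args fix above is about units: +hmax counts Erlang words, not bytes, so the old literal 1000000000 allowed roughly 8 GB heaps on a 64-bit VM rather than the intended 1000 MB. The corrected arithmetic:

    words = div(500_000_000, :erlang.system_info(:wordsize))
    # On a 64-bit VM: div(500_000_000, 8) == 62_500_000 words == 500 MB.
    # The old "+hmax 1000000000" meant 1_000_000_000 words, i.e. ~8 GB.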
payload: %{"status" => "ok"}}, 500 refute_receive %Message{event: "presence_state"}, 500 end + + test "presence automatically enabled when user sends track message for public channel", %{tenant: tenant} do + {socket, _} = get_connection(tenant) + config = %{presence: %{key: "", enabled: false}, private: false} + topic = "realtime:any" + + WebsocketClient.join(socket, topic, %{config: config}) + + assert_receive %Message{event: "phx_reply", payload: %{"status" => "ok"}, topic: ^topic}, 300 + refute_receive %Message{event: "presence_state"}, 500 + + payload = %{ + type: "presence", + event: "TRACK", + payload: %{name: "realtime_presence_96", t: 1814.7000000029802} + } + + WebsocketClient.send_event(socket, topic, "presence", payload) + + assert_receive %Message{event: "presence_diff", payload: %{"joins" => joins, "leaves" => %{}}, topic: ^topic} + + join_payload = joins |> Map.values() |> hd() |> get_in(["metas"]) |> hd() + assert get_in(join_payload, ["name"]) == payload.payload.name + assert get_in(join_payload, ["t"]) == payload.payload.t + end + + @tag policies: [:authenticated_read_broadcast_and_presence, :authenticated_write_broadcast_and_presence] + test "presence automatically enabled when user sends track message for private channel", + %{tenant: tenant, topic: topic} do + {socket, _} = get_connection(tenant, "authenticated") + config = %{presence: %{key: "", enabled: false}, private: true} + topic = "realtime:#{topic}" + + WebsocketClient.join(socket, topic, %{config: config}) + assert_receive %Message{event: "phx_reply", payload: %{"status" => "ok"}, topic: ^topic}, 300 + refute_receive %Message{event: "presence_state"}, 500 + + payload = %{ + type: "presence", + event: "TRACK", + payload: %{name: "realtime_presence_96", t: 1814.7000000029802} + } + + WebsocketClient.send_event(socket, topic, "presence", payload) + + assert_receive %Message{event: "presence_diff", payload: %{"joins" => joins, "leaves" => %{}}, topic: ^topic}, 500 + join_payload = joins |> Map.values() |> hd() |> get_in(["metas"]) |> hd() + assert get_in(join_payload, ["name"]) == payload.payload.name + assert get_in(join_payload, ["t"]) == payload.payload.t + end end describe "token handling" do @@ -2304,6 +2355,135 @@ defmodule Realtime.Integration.RtChannelTest do assert count == 2 end + describe "WAL bloat handling" do + setup %{tenant: tenant} do + topic = random_string() + {:ok, db_conn} = Database.connect(tenant, "realtime_test", :stop) + + %{rows: [[max_wal_size]]} = Postgrex.query!(db_conn, "SHOW max_wal_size", []) + %{rows: [[wal_keep_size]]} = Postgrex.query!(db_conn, "SHOW wal_keep_size", []) + %{rows: [[max_slot_wal_keep_size]]} = Postgrex.query!(db_conn, "SHOW max_slot_wal_keep_size", []) + + assert max_wal_size == "32MB" + assert wal_keep_size == "32MB" + assert max_slot_wal_keep_size == "32MB" + + Postgrex.query!(db_conn, "CREATE TABLE IF NOT EXISTS wal_test (id INT, data TEXT)", []) + + Postgrex.query!( + db_conn, + """ + CREATE OR REPLACE FUNCTION wal_test_trigger_func() RETURNS TRIGGER AS $$ + BEGIN + PERFORM realtime.send(json_build_object ('value', 'test' :: text)::jsonb, 'test', '#{topic}', false); + RETURN NULL; + END; + $$ LANGUAGE plpgsql; + """, + [] + ) + + Postgrex.query!(db_conn, "DROP TRIGGER IF EXISTS wal_test_trigger ON wal_test", []) + + Postgrex.query!( + db_conn, + """ + CREATE TRIGGER wal_test_trigger + AFTER INSERT OR UPDATE OR DELETE ON wal_test + FOR EACH ROW + EXECUTE FUNCTION wal_test_trigger_func() + """, + [] + ) + + GenServer.stop(db_conn) + + on_exit(fn -> + {:ok, 
db_conn} = Database.connect(tenant, "realtime_test", :stop) + + Postgrex.query!(db_conn, "DROP TABLE IF EXISTS wal_test CASCADE", []) + end) + + %{topic: topic} + end + + test "track PID changes during WAL bloat creation", %{tenant: tenant, topic: topic} do + {socket, _} = get_connection(tenant, "authenticated") + config = %{broadcast: %{self: true}, private: false} + full_topic = "realtime:#{topic}" + + active_slot_query = + "SELECT active_pid FROM pg_replication_slots where active_pid is not null and slot_name = 'supabase_realtime_messages_replication_slot_'" + + WebsocketClient.join(socket, full_topic, %{config: config}) + + assert_receive %Message{event: "phx_reply", payload: %{"status" => "ok"}}, 500 + assert_receive %Message{event: "presence_state"}, 500 + + assert Connect.ready?(tenant.external_id) + + {:ok, db_conn} = Connect.lookup_or_start_connection(tenant.external_id) + + original_connect_pid = Connect.whereis(tenant.external_id) + original_replication_pid = ReplicationConnection.whereis(tenant.external_id) + %{rows: [[original_db_pid]]} = Postgrex.query!(db_conn, active_slot_query, []) + + tasks = + for _ <- 1..5 do + Task.async(fn -> + {:ok, bloat_conn} = Database.connect(tenant, "realtime_bloat", :stop) + + Postgrex.transaction(bloat_conn, fn conn -> + Postgrex.query(conn, "INSERT INTO wal_test SELECT generate_series(1, 100000), repeat('x', 2000)", []) + {:error, "test"} + end) + + Process.exit(bloat_conn, :normal) + end) + end + + Task.await_many(tasks, 20000) + + # Kill all pending transactions still running + Postgrex.query!( + db_conn, + "SELECT pg_terminate_backend(pid) from pg_stat_activity where application_name='realtime_bloat'", + [] + ) + + # Does it recover? + assert Connect.ready?(tenant.external_id) + {:ok, db_conn} = Connect.lookup_or_start_connection(tenant.external_id) + Process.sleep(1000) + %{rows: [[new_db_pid]]} = Postgrex.query!(db_conn, active_slot_query, []) + + assert new_db_pid != original_db_pid + assert ^original_connect_pid = Connect.whereis(tenant.external_id) + assert original_replication_pid != ReplicationConnection.whereis(tenant.external_id) + + # Check if socket is still connected + payload = %{"event" => "TEST", "payload" => %{"msg" => 1}, "type" => "broadcast"} + WebsocketClient.send_event(socket, full_topic, "broadcast", payload) + assert_receive %Message{event: "broadcast", payload: ^payload, topic: ^full_topic}, 500 + + # Check if we are receiving the message from replication connection + Postgrex.query!(db_conn, "INSERT INTO wal_test VALUES (1, 'test')", []) + + assert_receive %Phoenix.Socket.Message{ + event: "broadcast", + payload: %{ + "event" => "test", + "payload" => %{"value" => "test"}, + "type" => "broadcast" + }, + join_ref: nil, + ref: nil, + topic: ^full_topic + }, + 5000 + end + end + defp mode(%{mode: :distributed}) do tenant = Api.get_tenant_by_external_id("dev_tenant") diff --git a/test/realtime/gen_rpc_pub_sub_test.exs b/test/realtime/gen_rpc_pub_sub_test.exs new file mode 100644 index 000000000..0013c2e7b --- /dev/null +++ b/test/realtime/gen_rpc_pub_sub_test.exs @@ -0,0 +1,2 @@ +Application.put_env(:phoenix_pubsub, :test_adapter, {Realtime.GenRpcPubSub, []}) +Code.require_file("../../deps/phoenix_pubsub/test/shared/pubsub_test.exs", __DIR__) diff --git a/test/realtime/gen_rpc_test.exs b/test/realtime/gen_rpc_test.exs index dd837aaf8..0c41d3ea1 100644 --- a/test/realtime/gen_rpc_test.exs +++ b/test/realtime/gen_rpc_test.exs @@ -172,6 +172,51 @@ defmodule Realtime.GenRpcTest do mechanism: :gen_rpc }} end + + test "bad 
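The cleanup step in this test works because each helper connection is tagged: Postgrex forwards application_name to Postgres, so stray realtime_bloat sessions can be terminated by name. A self-contained sketch, assuming a local Postgres reachable with default credentials:

    {:ok, conn} =
      Postgrex.start_link(
        database: "postgres",
        parameters: [application_name: "realtime_bloat"]
      )

    {:ok, admin} = Postgrex.start_link(database: "postgres")

    Postgrex.query!(
      admin,
      "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE application_name = $1",
      ["realtime_bloat"]
    )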
node" do + node = :"unknown@1.1.1.1" + + log = + capture_log(fn -> + assert GenRpc.call(node, Map, :fetch, [%{a: 1}, :a], tenant_id: 123) == {:error, :rpc_error, :badnode} + end) + + assert log =~ + ~r/project=123 external_id=123 \[error\] ErrorOnRpcCall: %{+error: :badnode, mod: Map, func: :fetch, target: :"#{node}"/ + end + end + + describe "abcast/4" do + test "abcast to registered process", %{node: node} do + name = + System.unique_integer() + |> to_string() + |> String.to_atom() + + :erlang.register(name, self()) + + # Use erpc to make the other node abcast to this one + :erpc.call(node, GenRpc, :abcast, [[node()], name, "a message", []]) + + assert_receive "a message" + refute_receive _any + end + + @tag extra_config: [{:gen_rpc, :tcp_server_port, 9999}] + test "tcp error" do + Logger.put_process_level(self(), :debug) + + log = + capture_log(fn -> + assert GenRpc.abcast(Node.list(), :some_process_name, "a message", []) == :ok + # We have to wait for gen_rpc logs to show up + Process.sleep(100) + end) + + assert log =~ "[error] event=connect_to_remote_server" + + refute_receive _any + end end describe "multicast/4" do diff --git a/test/realtime/messages_test.exs b/test/realtime/messages_test.exs index 3bef9a5e0..cca0ce742 100644 --- a/test/realtime/messages_test.exs +++ b/test/realtime/messages_test.exs @@ -16,32 +16,221 @@ defmodule Realtime.MessagesTest do %{conn: conn, tenant: tenant, date_start: date_start, date_end: date_end} end - test "delete_old_messages/1 deletes messages older than 72 hours", %{ - conn: conn, - tenant: tenant, - date_start: date_start, - date_end: date_end - } do - utc_now = NaiveDateTime.utc_now() - limit = NaiveDateTime.add(utc_now, -72, :hour) - - messages = - for date <- Date.range(date_start, date_end) do - inserted_at = date |> NaiveDateTime.new!(Time.new!(0, 0, 0)) - message_fixture(tenant, %{inserted_at: inserted_at}) + describe "replay/5" do + test "invalid replay params" do + assert Messages.replay(self(), "a topic", "not a number", 123) == + {:error, :invalid_replay_params} + + assert Messages.replay(self(), "a topic", 123, "not a number") == + {:error, :invalid_replay_params} + + assert Messages.replay(self(), "a topic", 253_402_300_800_000, 10) == + {:error, :invalid_replay_params} + end + + test "empty replay", %{conn: conn} do + assert Messages.replay(conn, "test", 0, 10) == {:ok, [], MapSet.new()} + end + + test "replay respects limit", %{conn: conn, tenant: tenant} do + m1 = + message_fixture(tenant, %{ + "inserted_at" => NaiveDateTime.utc_now() |> NaiveDateTime.add(-1, :minute), + "event" => "new", + "extension" => "broadcast", + "topic" => "test", + "private" => true, + "payload" => %{"value" => "new"} + }) + + message_fixture(tenant, %{ + "inserted_at" => NaiveDateTime.utc_now() |> NaiveDateTime.add(-2, :minute), + "event" => "old", + "extension" => "broadcast", + "topic" => "test", + "private" => true, + "payload" => %{"value" => "old"} + }) + + assert Messages.replay(conn, "test", 0, 1) == {:ok, [m1], MapSet.new([m1.id])} + end + + test "replay private topic only", %{conn: conn, tenant: tenant} do + privatem = + message_fixture(tenant, %{ + "private" => true, + "inserted_at" => NaiveDateTime.utc_now() |> NaiveDateTime.add(-1, :minute), + "event" => "new", + "extension" => "broadcast", + "topic" => "test", + "payload" => %{"value" => "new"} + }) + + message_fixture(tenant, %{ + "private" => false, + "inserted_at" => NaiveDateTime.utc_now() |> NaiveDateTime.add(-2, :minute), + "event" => "old", + "extension" => "broadcast", + "topic" => 
"test", + "payload" => %{"value" => "old"} + }) + + assert Messages.replay(conn, "test", 0, 10) == {:ok, [privatem], MapSet.new([privatem.id])} + end + + test "replay extension=broadcast", %{conn: conn, tenant: tenant} do + privatem = + message_fixture(tenant, %{ + "private" => true, + "inserted_at" => NaiveDateTime.utc_now() |> NaiveDateTime.add(-1, :minute), + "event" => "new", + "extension" => "broadcast", + "topic" => "test", + "payload" => %{"value" => "new"} + }) + + message_fixture(tenant, %{ + "private" => true, + "inserted_at" => NaiveDateTime.utc_now() |> NaiveDateTime.add(-2, :minute), + "event" => "old", + "extension" => "presence", + "topic" => "test", + "payload" => %{"value" => "old"} + }) + + assert Messages.replay(conn, "test", 0, 10) == {:ok, [privatem], MapSet.new([privatem.id])} + end + + test "replay respects since", %{conn: conn, tenant: tenant} do + m1 = + message_fixture(tenant, %{ + "inserted_at" => NaiveDateTime.utc_now() |> NaiveDateTime.add(-2, :minute), + "event" => "first", + "extension" => "broadcast", + "topic" => "test", + "private" => true, + "payload" => %{"value" => "first"} + }) + + m2 = + message_fixture(tenant, %{ + "inserted_at" => NaiveDateTime.utc_now() |> NaiveDateTime.add(-1, :minute), + "event" => "second", + "extension" => "broadcast", + "topic" => "test", + "private" => true, + "payload" => %{"value" => "second"} + }) + + message_fixture(tenant, %{ + "inserted_at" => NaiveDateTime.utc_now() |> NaiveDateTime.add(-10, :minute), + "event" => "old", + "extension" => "broadcast", + "topic" => "test", + "private" => true, + "payload" => %{"value" => "old"} + }) + + since = DateTime.utc_now() |> DateTime.add(-3, :minute) |> DateTime.to_unix(:millisecond) + + assert Messages.replay(conn, "test", since, 10) == {:ok, [m1, m2], MapSet.new([m1.id, m2.id])} + end + + test "replay respects hard max limit of 25", %{conn: conn, tenant: tenant} do + for _i <- 1..30 do + message_fixture(tenant, %{ + "inserted_at" => NaiveDateTime.utc_now(), + "event" => "event", + "extension" => "broadcast", + "topic" => "test", + "private" => true, + "payload" => %{"value" => "message"} + }) end - assert length(messages) == 11 + assert {:ok, messages, set} = Messages.replay(conn, "test", 0, 30) + assert length(messages) == 25 + assert MapSet.size(set) == 25 + end + + test "replay respects hard min limit of 1", %{conn: conn, tenant: tenant} do + message_fixture(tenant, %{ + "inserted_at" => NaiveDateTime.utc_now(), + "event" => "event", + "extension" => "broadcast", + "topic" => "test", + "private" => true, + "payload" => %{"value" => "message"} + }) + + assert {:ok, messages, set} = Messages.replay(conn, "test", 0, 0) + assert length(messages) == 1 + assert MapSet.size(set) == 1 + end + + test "distributed replay", %{conn: conn, tenant: tenant} do + m = + message_fixture(tenant, %{ + "inserted_at" => NaiveDateTime.utc_now(), + "event" => "event", + "extension" => "broadcast", + "topic" => "test", + "private" => true, + "payload" => %{"value" => "message"} + }) + + {:ok, node} = Clustered.start() + + # Call remote node passing the database connection that is local to this node + assert :erpc.call(node, Messages, :replay, [conn, "test", 0, 30]) == {:ok, [m], MapSet.new([m.id])} + end + + test "distributed replay error", %{tenant: tenant} do + message_fixture(tenant, %{ + "inserted_at" => NaiveDateTime.utc_now(), + "event" => "event", + "extension" => "broadcast", + "topic" => "test", + "private" => true, + "payload" => %{"value" => "message"} + }) + + {:ok, node} = 
Clustered.start() + + # Call remote node passing the database connection that is local to this node + pid = spawn(fn -> :ok end) + assert :erpc.call(node, Messages, :replay, [pid, "test", 0, 30]) == {:error, :failed_to_replay_messages} + end + end + + describe "delete_old_messages/1" do + test "delete_old_messages/1 deletes messages older than 72 hours", %{ + conn: conn, + tenant: tenant, + date_start: date_start, + date_end: date_end + } do + utc_now = NaiveDateTime.utc_now() + limit = NaiveDateTime.add(utc_now, -72, :hour) + + messages = + for date <- Date.range(date_start, date_end) do + inserted_at = date |> NaiveDateTime.new!(Time.new!(0, 0, 0)) + message_fixture(tenant, %{inserted_at: inserted_at}) + end + + assert length(messages) == 11 - to_keep = - Enum.reject( - messages, - &(NaiveDateTime.compare(limit, &1.inserted_at) == :gt) - ) + to_keep = + Enum.reject( + messages, + &(NaiveDateTime.compare(NaiveDateTime.beginning_of_day(limit), &1.inserted_at) == :gt) + ) - assert :ok = Messages.delete_old_messages(conn) - {:ok, current} = Repo.all(conn, from(m in Message), Message) + assert :ok = Messages.delete_old_messages(conn) + {:ok, current} = Repo.all(conn, from(m in Message), Message) - assert Enum.sort(current) == Enum.sort(to_keep) + assert Enum.sort(current) == Enum.sort(to_keep) + end end end diff --git a/test/realtime/monitoring/erl_sys_mon_test.exs b/test/realtime/monitoring/erl_sys_mon_test.exs index b1e122d58..e9c7b87b7 100644 --- a/test/realtime/monitoring/erl_sys_mon_test.exs +++ b/test/realtime/monitoring/erl_sys_mon_test.exs @@ -5,16 +5,25 @@ defmodule Realtime.Monitoring.ErlSysMonTest do describe "system monitoring" do test "logs system monitor events" do - start_supervised!({ErlSysMon, config: [{:long_message_queue, {1, 10}}]}) + start_supervised!({ErlSysMon, config: [{:long_message_queue, {1, 100}}]}) - assert capture_log(fn -> - Task.async(fn -> - Enum.map(1..1000, &send(self(), &1)) - # Wait for ErlSysMon to notice - Process.sleep(4000) - end) - |> Task.await() - end) =~ "Realtime.ErlSysMon message:" + log = + capture_log(fn -> + Task.async(fn -> + Process.register(self(), TestProcess) + Enum.map(1..1000, &send(self(), &1)) + # Wait for ErlSysMon to notice + Process.sleep(4000) + end) + |> Task.await() + end) + + assert log =~ "Realtime.ErlSysMon message:" + assert log =~ "$initial_call\", {Realtime.Monitoring.ErlSysMonTest" + assert log =~ "ancestors\", [#{inspect(self())}]" + assert log =~ "registered_name: TestProcess" + assert log =~ "message_queue_len: " + assert log =~ "total_heap_size: " end end end diff --git a/test/realtime/monitoring/prom_ex/plugins/phoenix_test.exs b/test/realtime/monitoring/prom_ex/plugins/phoenix_test.exs index a73e6e2f5..ad9198c97 100644 --- a/test/realtime/monitoring/prom_ex/plugins/phoenix_test.exs +++ b/test/realtime/monitoring/prom_ex/plugins/phoenix_test.exs @@ -1,6 +1,7 @@ defmodule Realtime.PromEx.Plugins.PhoenixTest do use Realtime.DataCase, async: false alias Realtime.PromEx.Plugins + alias Realtime.Integration.WebsocketClient defmodule MetricsTest do use PromEx, otp_app: :realtime_test_phoenix @@ -13,16 +14,20 @@ defmodule Realtime.PromEx.Plugins.PhoenixTest do describe "pooling metrics" do setup do start_supervised!(MetricsTest) - :ok + %{tenant: Containers.checkout_tenant(run_migrations: true)} end - test "number of connections" do - # Trigger a connection by making a request to the endpoint - url = RealtimeWeb.Endpoint.url() <> "/healthcheck" - Req.get!(url) + test "number of connections", %{tenant: tenant} do + {:ok, 
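The replay window is expressed in Unix milliseconds, which explains both the since arithmetic in these tests and why 253_402_300_800_000 (the first millisecond of year 10000, outside DateTime.from_unix/2's range) is rejected as invalid. A sketch of the conversion the replay query presumably performs before comparing against inserted_at:

    since_ms = DateTime.utc_now() |> DateTime.add(-3, :minute) |> DateTime.to_unix(:millisecond)

    since =
      since_ms
      |> DateTime.from_unix!(:millisecond)
      |> DateTime.to_naive()

    # Messages with inserted_at >= since are candidates for replay
    {:ok, _} = DateTime.from_unix(253_402_300_799_999, :millisecond)
    {:error, :invalid_unix_time} = DateTime.from_unix(253_402_300_800_000, :millisecond)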
token} = token_valid(tenant, "anon", %{}) + + {:ok, _} = + WebsocketClient.connect(self(), uri(tenant, 4002), Phoenix.Socket.V1.JSONSerializer, [{"x-api-key", token}]) + + {:ok, _} = + WebsocketClient.connect(self(), uri(tenant, 4002), Phoenix.Socket.V1.JSONSerializer, [{"x-api-key", token}]) Process.sleep(200) - assert metric_value() > 0 + assert metric_value() >= 2 end end diff --git a/test/realtime/syn_handler_test.exs b/test/realtime/syn_handler_test.exs index 2b27cf322..1cf0d3bad 100644 --- a/test/realtime/syn_handler_test.exs +++ b/test/realtime/syn_handler_test.exs @@ -168,32 +168,40 @@ defmodule Realtime.SynHandlerTest do test "it handles :syn_conflict_resolution reason" do reason = :syn_conflict_resolution + pid = self() log = capture_log(fn -> - assert SynHandler.on_process_unregistered(@mod, @name, self(), %{}, reason) == :ok + assert SynHandler.on_process_unregistered(@mod, @name, pid, %{}, reason) == :ok end) topic = "#{@topic}:#{@name}" event = "#{@topic}_down" assert log =~ "#{@mod} terminated due to syn conflict resolution: #{inspect(@name)} #{inspect(self())}" - assert_receive %Phoenix.Socket.Broadcast{topic: ^topic, event: ^event, payload: nil} + assert_receive %Phoenix.Socket.Broadcast{topic: ^topic, event: ^event, payload: %{reason: ^reason, pid: ^pid}} end test "it handles other reasons" do reason = :other_reason + pid = self() log = capture_log(fn -> - assert SynHandler.on_process_unregistered(@mod, @name, self(), %{}, reason) == :ok + assert SynHandler.on_process_unregistered(@mod, @name, pid, %{}, reason) == :ok end) topic = "#{@topic}:#{@name}" event = "#{@topic}_down" refute log =~ "#{@mod} terminated: #{inspect(@name)} #{node()}" - assert_receive %Phoenix.Socket.Broadcast{topic: ^topic, event: ^event, payload: nil}, 500 + + assert_receive %Phoenix.Socket.Broadcast{ + topic: ^topic, + event: ^event, + payload: %{reason: ^reason, pid: ^pid} + }, + 500 end end end diff --git a/test/realtime/tenants/connect_test.exs b/test/realtime/tenants/connect_test.exs index 290fb1c8d..8ba462b27 100644 --- a/test/realtime/tenants/connect_test.exs +++ b/test/realtime/tenants/connect_test.exs @@ -78,12 +78,55 @@ defmodule Realtime.Tenants.ConnectTest do assert_receive {:ok, ^pid} end - test "more than 5 seconds passed error out", %{tenant: tenant} do + test "more than 15 seconds passed error out", %{tenant: tenant} do parent = self() # Let's slow down Connect starting expect(Database, :check_tenant_connection, fn t -> - :timer.sleep(5500) + Process.sleep(15500) + call_original(Database, :check_tenant_connection, [t]) + end) + + connect = fn -> send(parent, Connect.lookup_or_start_connection(tenant.external_id)) end + + spawn(connect) + spawn(connect) + + {:error, :initializing} = Connect.lookup_or_start_connection(tenant.external_id) + # The above call waited 15 seconds + assert_receive {:error, :initializing} + assert_receive {:error, :initializing} + + # This one will succeed + {:ok, _pid} = Connect.lookup_or_start_connection(tenant.external_id) + end + + test "too many db connections", %{tenant: tenant} do + extension = %{ + "type" => "postgres_cdc_rls", + "settings" => %{ + "db_host" => "127.0.0.1", + "db_name" => "postgres", + "db_user" => "supabase_admin", + "db_password" => "postgres", + "poll_interval" => 100, + "poll_max_changes" => 100, + "poll_max_record_bytes" => 1_048_576, + "region" => "us-east-1", + "ssl_enforced" => false, + "db_pool" => 100, + "subcriber_pool_size" => 100, + "subs_pool_size" => 100 + } + } + + {:ok, tenant} = update_extension(tenant, extension) + + 
parent = self() + + # Let's slow down Connect starting + expect(Database, :check_tenant_connection, fn t -> + :timer.sleep(1000) call_original(Database, :check_tenant_connection, [t]) end) @@ -97,12 +140,13 @@ defmodule Realtime.Tenants.ConnectTest do spawn(connect) spawn(connect) - {:error, :tenant_database_unavailable} = Connect.lookup_or_start_connection(tenant.external_id) + # This one should block and wait for the first Connect + {:error, :tenant_db_too_many_connections} = Connect.lookup_or_start_connection(tenant.external_id) - # Only one will succeed the others timed out waiting - assert_receive {:error, :tenant_database_unavailable} - assert_receive {:error, :tenant_database_unavailable} - assert_receive {:ok, _pid}, 7000 + assert_receive {:error, :tenant_db_too_many_connections} + assert_receive {:error, :tenant_db_too_many_connections} + assert_receive {:error, :tenant_db_too_many_connections} + refute_receive _any end end @@ -267,6 +311,34 @@ defmodule Realtime.Tenants.ConnectTest do assert {:error, :tenant_suspended} = Connect.lookup_or_start_connection(tenant.external_id) end + test "tenant not able to connect if database has not enough connections", %{ + tenant: tenant + } do + extension = %{ + "type" => "postgres_cdc_rls", + "settings" => %{ + "db_host" => "127.0.0.1", + "db_name" => "postgres", + "db_user" => "supabase_admin", + "db_password" => "postgres", + "poll_interval" => 100, + "poll_max_changes" => 100, + "poll_max_record_bytes" => 1_048_576, + "region" => "us-east-1", + "ssl_enforced" => false, + "db_pool" => 100, + "subcriber_pool_size" => 100, + "subs_pool_size" => 100 + } + } + + {:ok, tenant} = update_extension(tenant, extension) + + assert capture_log(fn -> + assert {:error, :tenant_db_too_many_connections} = Connect.lookup_or_start_connection(tenant.external_id) + end) =~ ~r/Only \d+ available connections\. 
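The setup relies on Mimic's expect plus call_original to inject latency without changing behaviour, so concurrent callers pile up behind one slow connection attempt; Database here is Realtime.Database, as aliased in this test file. The pattern in isolation:

    Mimic.expect(Realtime.Database, :check_tenant_connection, fn tenant ->
      # Delay, then defer to the real implementation
      Process.sleep(1_000)
      Mimic.call_original(Realtime.Database, :check_tenant_connection, [tenant])
    end)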
At least \d+ connections are required/ + end + test "handles tenant suspension and unsuspension in a reactive way", %{tenant: tenant} do assert {:ok, db_conn} = Connect.lookup_or_start_connection(tenant.external_id) assert Connect.ready?(tenant.external_id) @@ -352,11 +424,13 @@ defmodule Realtime.Tenants.ConnectTest do assert replication_connection_before == replication_connection_after end - test "on replication connection postgres pid being stopped, also kills the Connect module", %{tenant: tenant} do + test "on replication connection postgres pid being stopped, Connect module recovers it", %{tenant: tenant} do assert {:ok, db_conn} = Connect.lookup_or_start_connection(tenant.external_id) assert Connect.ready?(tenant.external_id) replication_connection_pid = ReplicationConnection.whereis(tenant.external_id) + Process.monitor(replication_connection_pid) + assert Process.alive?(replication_connection_pid) pid = Connect.whereis(tenant.external_id) @@ -366,21 +440,33 @@ defmodule Realtime.Tenants.ConnectTest do [] ) - assert_process_down(replication_connection_pid) - assert_process_down(pid) + assert_receive {:DOWN, _, :process, ^replication_connection_pid, _} + + Process.sleep(1500) + new_replication_connection_pid = ReplicationConnection.whereis(tenant.external_id) + + assert replication_connection_pid != new_replication_connection_pid + assert Process.alive?(new_replication_connection_pid) + assert Process.alive?(pid) end - test "on replication connection exit, also kills the Connect module", %{tenant: tenant} do + test "on replication connection exit, Connect module recovers it", %{tenant: tenant} do assert {:ok, _db_conn} = Connect.lookup_or_start_connection(tenant.external_id) assert Connect.ready?(tenant.external_id) replication_connection_pid = ReplicationConnection.whereis(tenant.external_id) + Process.monitor(replication_connection_pid) assert Process.alive?(replication_connection_pid) pid = Connect.whereis(tenant.external_id) Process.exit(replication_connection_pid, :kill) + assert_receive {:DOWN, _, :process, ^replication_connection_pid, _} - assert_process_down(replication_connection_pid) - assert_process_down(pid) + Process.sleep(1500) + new_replication_connection_pid = ReplicationConnection.whereis(tenant.external_id) + + assert replication_connection_pid != new_replication_connection_pid + assert Process.alive?(new_replication_connection_pid) + assert Process.alive?(pid) end test "handles max_wal_senders by logging the correct operational code", %{tenant: tenant} do @@ -449,30 +535,6 @@ defmodule Realtime.Tenants.ConnectTest do test "if tenant does not exist, does nothing" do assert :ok = Connect.shutdown("none") end - - test "tenant not able to connect if database has not enough connections", %{tenant: tenant} do - extension = %{ - "type" => "postgres_cdc_rls", - "settings" => %{ - "db_host" => "127.0.0.1", - "db_name" => "postgres", - "db_user" => "supabase_admin", - "db_password" => "postgres", - "poll_interval" => 100, - "poll_max_changes" => 100, - "poll_max_record_bytes" => 1_048_576, - "region" => "us-east-1", - "ssl_enforced" => false, - "db_pool" => 100, - "subcriber_pool_size" => 100, - "subs_pool_size" => 100 - } - } - - {:ok, tenant} = update_extension(tenant, extension) - - assert {:error, :tenant_db_too_many_connections} = Connect.lookup_or_start_connection(tenant.external_id) - end end describe "registers into local registry" do diff --git a/test/realtime/tenants/janitor/maintenance_task_test.exs b/test/realtime/tenants/janitor/maintenance_task_test.exs index 
f4c51436e..4c42b7ab3 100644 --- a/test/realtime/tenants/janitor/maintenance_task_test.exs +++ b/test/realtime/tenants/janitor/maintenance_task_test.exs @@ -15,9 +15,15 @@ defmodule Realtime.Tenants.Janitor.MaintenanceTaskTest do end test "cleans messages older than 72 hours and creates partitions", %{tenant: tenant} do + {:ok, conn} = Database.connect(tenant, "realtime_test", :stop) + utc_now = NaiveDateTime.utc_now() limit = NaiveDateTime.add(utc_now, -72, :hour) + date_start = Date.utc_today() |> Date.add(-10) + date_end = Date.utc_today() + create_messages_partitions(conn, date_start, date_end) + messages = for days <- -5..0 do inserted_at = NaiveDateTime.add(utc_now, days, :day) @@ -27,12 +33,11 @@ defmodule Realtime.Tenants.Janitor.MaintenanceTaskTest do to_keep = messages - |> Enum.reject(&(NaiveDateTime.compare(limit, &1.inserted_at) == :gt)) + |> Enum.reject(&(NaiveDateTime.compare(NaiveDateTime.beginning_of_day(limit), &1.inserted_at) == :gt)) |> MapSet.new() assert MaintenanceTask.run(tenant.external_id) == :ok - {:ok, conn} = Database.connect(tenant, "realtime_test", :stop) {:ok, res} = Repo.all(conn, from(m in Message), Message) verify_partitions(conn) @@ -80,7 +85,7 @@ defmodule Realtime.Tenants.Janitor.MaintenanceTaskTest do defp verify_partitions(conn) do today = Date.utc_today() - yesterday = Date.add(today, -1) + yesterday = Date.add(today, -3) future = Date.add(today, 3) dates = Date.range(yesterday, future) diff --git a/test/realtime/tenants/janitor_test.exs b/test/realtime/tenants/janitor_test.exs index 4ac1a0eda..fb597a4c4 100644 --- a/test/realtime/tenants/janitor_test.exs +++ b/test/realtime/tenants/janitor_test.exs @@ -31,6 +31,14 @@ defmodule Realtime.Tenants.JanitorTest do end ) + date_start = Date.utc_today() |> Date.add(-10) + date_end = Date.utc_today() + + Enum.map(tenants, fn tenant -> + {:ok, conn} = Database.connect(tenant, "realtime_test", :stop) + create_messages_partitions(conn, date_start, date_end) + end) + start_supervised!( {Task.Supervisor, name: Realtime.Tenants.Janitor.TaskSupervisor, max_children: 5, max_seconds: 500, max_restarts: 1} @@ -62,7 +70,7 @@ defmodule Realtime.Tenants.JanitorTest do to_keep = messages - |> Enum.reject(&(NaiveDateTime.compare(limit, &1.inserted_at) == :gt)) + |> Enum.reject(&(NaiveDateTime.compare(NaiveDateTime.beginning_of_day(limit), &1.inserted_at) == :gt)) |> MapSet.new() start_supervised!(Janitor) @@ -105,7 +113,7 @@ defmodule Realtime.Tenants.JanitorTest do to_keep = messages - |> Enum.reject(&(NaiveDateTime.compare(limit, &1.inserted_at) == :gt)) + |> Enum.reject(&(NaiveDateTime.compare(NaiveDateTime.beginning_of_day(limit), &1.inserted_at) == :gt)) |> MapSet.new() start_supervised!(Janitor) @@ -162,7 +170,7 @@ defmodule Realtime.Tenants.JanitorTest do defp verify_partitions(conn) do today = Date.utc_today() - yesterday = Date.add(today, -1) + yesterday = Date.add(today, -3) future = Date.add(today, 3) dates = Date.range(yesterday, future) diff --git a/test/realtime/tenants/replication_connection_test.exs b/test/realtime/tenants/replication_connection_test.exs index 783270313..b28a23988 100644 --- a/test/realtime/tenants/replication_connection_test.exs +++ b/test/realtime/tenants/replication_connection_test.exs @@ -98,6 +98,7 @@ defmodule Realtime.Tenants.ReplicationConnectionTest do payload = %{ "event" => "INSERT", + "meta" => %{"id" => row.id}, "payload" => %{ "id" => row.id, "value" => value @@ -139,8 +140,9 @@ defmodule Realtime.Tenants.ReplicationConnectionTest do "event" => "broadcast", "payload" => %{ 
"event" => "INSERT", + "meta" => %{"id" => id}, "payload" => %{ - "id" => _, + "id" => id, "value" => ^value } }, @@ -222,21 +224,26 @@ defmodule Realtime.Tenants.ReplicationConnectionTest do "payload" => %{"value" => "something"} }) + fixture_id = fixture.id + assert_receive {:socket_push, :text, data}, 500 message = data |> IO.iodata_to_binary() |> Jason.decode!() assert %{ "event" => "broadcast", - "payload" => %{"event" => "INSERT", "payload" => payload, "type" => "broadcast"}, + "payload" => %{ + "event" => "INSERT", + "meta" => %{"id" => ^fixture_id}, + "payload" => payload, + "type" => "broadcast" + }, "ref" => nil, "topic" => ^topic } = message - id = fixture.id - assert payload == %{ "value" => "something", - "id" => id + "id" => fixture_id } end @@ -252,19 +259,25 @@ defmodule Realtime.Tenants.ReplicationConnectionTest do payload = %{"value" => "something", "id" => "123456"} - message_fixture(tenant, %{ - "topic" => topic, - "private" => true, - "event" => "INSERT", - "payload" => payload - }) + %{id: fixture_id} = + message_fixture(tenant, %{ + "topic" => topic, + "private" => true, + "event" => "INSERT", + "payload" => payload + }) assert_receive {:socket_push, :text, data}, 500 message = data |> IO.iodata_to_binary() |> Jason.decode!() assert %{ "event" => "broadcast", - "payload" => %{"event" => "INSERT", "payload" => ^payload, "type" => "broadcast"}, + "payload" => %{ + "meta" => %{"id" => ^fixture_id}, + "event" => "INSERT", + "payload" => ^payload, + "type" => "broadcast" + }, "ref" => nil, "topic" => ^topic } = message @@ -331,6 +344,26 @@ defmodule Realtime.Tenants.ReplicationConnectionTest do assert {:error, :max_wal_senders_reached} = ReplicationConnection.start(tenant, self()) end + + test "handles WAL pressure gracefully", %{tenant: tenant} do + {:ok, replication_pid} = ReplicationConnection.start(tenant, self()) + + {:ok, conn} = Database.connect(tenant, "realtime_test", :stop) + on_exit(fn -> Process.exit(conn, :normal) end) + + large_payload = String.duplicate("x", 10 * 1024 * 1024) + + for i <- 1..5 do + message_fixture_with_conn(tenant, conn, %{ + "topic" => "stress_#{i}", + "private" => true, + "event" => "INSERT", + "payload" => %{"data" => large_payload} + }) + end + + assert Process.alive?(replication_pid) + end end describe "whereis/1" do @@ -409,4 +442,20 @@ defmodule Realtime.Tenants.ReplicationConnectionTest do ref = Process.monitor(pid) assert_receive {:DOWN, ^ref, :process, ^pid, _reason}, timeout end + + defp message_fixture_with_conn(_tenant, conn, override) do + create_attrs = %{ + "topic" => random_string(), + "extension" => "broadcast" + } + + override = override |> Enum.map(fn {k, v} -> {"#{k}", v} end) |> Map.new() + + {:ok, message} = + create_attrs + |> Map.merge(override) + |> TenantConnection.create_message(conn) + + message + end end diff --git a/test/realtime_web/channels/payloads/join_test.exs b/test/realtime_web/channels/payloads/join_test.exs index 32bf1b397..c1ea54a67 100644 --- a/test/realtime_web/channels/payloads/join_test.exs +++ b/test/realtime_web/channels/payloads/join_test.exs @@ -6,6 +6,7 @@ defmodule RealtimeWeb.Channels.Payloads.JoinTest do alias RealtimeWeb.Channels.Payloads.Join alias RealtimeWeb.Channels.Payloads.Config alias RealtimeWeb.Channels.Payloads.Broadcast + alias RealtimeWeb.Channels.Payloads.Broadcast.Replay alias RealtimeWeb.Channels.Payloads.Presence alias RealtimeWeb.Channels.Payloads.PostgresChange @@ -17,7 +18,7 @@ defmodule RealtimeWeb.Channels.Payloads.JoinTest do config = %{ "config" => %{ "private" => 
false, - "broadcast" => %{"ack" => false, "self" => false}, + "broadcast" => %{"ack" => false, "self" => false, "replay" => %{"since" => 1, "limit" => 10}}, "presence" => %{"enabled" => true, "key" => key}, "postgres_changes" => [ %{"event" => "INSERT", "schema" => "public", "table" => "users", "filter" => "id=eq.1"}, @@ -37,8 +38,9 @@ defmodule RealtimeWeb.Channels.Payloads.JoinTest do postgres_changes: postgres_changes } = config - assert %Broadcast{ack: false, self: false} = broadcast + assert %Broadcast{ack: false, self: false, replay: replay} = broadcast assert %Presence{enabled: true, key: ^key} = presence + assert %Replay{since: 1, limit: 10} = replay assert [ %PostgresChange{event: "INSERT", schema: "public", table: "users", filter: "id=eq.1"}, @@ -56,6 +58,17 @@ defmodule RealtimeWeb.Channels.Payloads.JoinTest do assert is_binary(key) end + test "invalid replay" do + config = %{"config" => %{"broadcast" => %{"replay" => 123}}} + + assert { + :error, + :invalid_join_payload, + %{config: %{broadcast: %{replay: ["unable to parse, expected a map"]}}} + } = + Join.validate(config) + end + test "missing enabled presence defaults to true" do config = %{"config" => %{"presence" => %{}}} diff --git a/test/realtime_web/channels/realtime_channel/logging_test.exs b/test/realtime_web/channels/realtime_channel/logging_test.exs index 92634daef..cd131d16e 100644 --- a/test/realtime_web/channels/realtime_channel/logging_test.exs +++ b/test/realtime_web/channels/realtime_channel/logging_test.exs @@ -37,6 +37,7 @@ defmodule RealtimeWeb.RealtimeChannel.LoggingTest do assert log =~ "sub=#{sub}" assert log =~ "exp=#{exp}" assert log =~ "iss=#{iss}" + assert log =~ "error_code=TestError" end end @@ -57,6 +58,7 @@ defmodule RealtimeWeb.RealtimeChannel.LoggingTest do assert log =~ "sub=#{sub}" assert log =~ "exp=#{exp}" assert log =~ "iss=#{iss}" + assert log =~ "error_code=TestWarning" end end @@ -67,10 +69,14 @@ defmodule RealtimeWeb.RealtimeChannel.LoggingTest do for log_level <- log_levels do socket = %{assigns: %{log_level: log_level, tenant: random_string(), access_token: "test_token"}} - assert capture_log(fn -> - assert Logging.maybe_log_error(socket, "TestCode", "test message") == - {:error, %{reason: "TestCode: test message"}} - end) =~ "TestCode: test message" + log = + capture_log(fn -> + assert Logging.maybe_log_error(socket, "TestCode", "test message") == + {:error, %{reason: "TestCode: test message"}} + end) + + assert log =~ "TestCode: test message" + assert log =~ "error_code=TestCode" assert capture_log(fn -> assert Logging.maybe_log_error(socket, "TestCode", %{a: "b"}) == @@ -103,11 +109,14 @@ defmodule RealtimeWeb.RealtimeChannel.LoggingTest do for log_level <- log_levels do socket = %{assigns: %{log_level: log_level, tenant: random_string(), access_token: "test_token"}} - assert capture_log(fn -> - assert Logging.maybe_log_warning(socket, "TestCode", "test message") == - {:error, %{reason: "TestCode: test message"}} - end) =~ - "TestCode: test message" + log = + capture_log(fn -> + assert Logging.maybe_log_warning(socket, "TestCode", "test message") == + {:error, %{reason: "TestCode: test message"}} + end) + + assert log =~ "TestCode: test message" + assert log =~ "error_code=TestCode" assert capture_log(fn -> assert Logging.maybe_log_warning(socket, "TestCode", %{a: "b"}) == diff --git a/test/realtime_web/channels/realtime_channel/message_dispatcher_test.exs b/test/realtime_web/channels/realtime_channel/message_dispatcher_test.exs index 7a9e2eb25..44ce83b99 100644 --- 
a/test/realtime_web/channels/realtime_channel/message_dispatcher_test.exs
+++ b/test/realtime_web/channels/realtime_channel/message_dispatcher_test.exs
@@ -16,12 +16,12 @@ defmodule RealtimeWeb.RealtimeChannel.MessageDispatcherTest do
   describe "fastlane_metadata/5" do
     test "info level" do
       assert MessageDispatcher.fastlane_metadata(self(), Serializer, "realtime:topic", :info, "tenant_id") ==
-               {:realtime_channel_fastlane, self(), Serializer, "realtime:topic", {:log, "tenant_id"}}
+               {:rc_fastlane, self(), Serializer, "realtime:topic", {:log, "tenant_id"}, MapSet.new()}
     end

     test "non-info level" do
       assert MessageDispatcher.fastlane_metadata(self(), Serializer, "realtime:topic", :warning, "tenant_id") ==
-               {:realtime_channel_fastlane, self(), Serializer, "realtime:topic"}
+               {:rc_fastlane, self(), Serializer, "realtime:topic", MapSet.new()}
     end
   end

@@ -50,12 +50,11 @@ defmodule RealtimeWeb.RealtimeChannel.MessageDispatcherTest do
     from_pid = :erlang.list_to_pid(~c'<0.2.1>')

     subscribers = [
-      {subscriber_pid, {:realtime_channel_fastlane, self(), TestSerializer, "realtime:topic", {:log, "tenant123"}}},
-      {subscriber_pid, {:realtime_channel_fastlane, self(), TestSerializer, "realtime:topic"}}
+      {subscriber_pid, {:rc_fastlane, self(), TestSerializer, "realtime:topic", {:log, "tenant123"}, MapSet.new()}},
+      {subscriber_pid, {:rc_fastlane, self(), TestSerializer, "realtime:topic", MapSet.new()}}
     ]

     msg = %Broadcast{topic: "some:other:topic", event: "event", payload: %{data: "test"}}
-    require Logger

     log =
       capture_log(fn ->
@@ -75,6 +74,87 @@ defmodule RealtimeWeb.RealtimeChannel.MessageDispatcherTest do
     refute_receive _any
   end

+  test "does not dispatch messages to fastlane subscribers if they already replayed it" do
+    parent = self()
+
+    subscriber_pid =
+      spawn(fn ->
+        loop = fn loop ->
+          receive do
+            msg ->
+              send(parent, {:subscriber, msg})
+              loop.(loop)
+          end
+        end
+
+        loop.(loop)
+      end)
+
+    from_pid = :erlang.list_to_pid(~c'<0.2.1>')
+    replayed_message_ids = MapSet.new(["123"])
+
+    subscribers = [
+      {subscriber_pid,
+       {:rc_fastlane, self(), TestSerializer, "realtime:topic", {:log, "tenant123"}, replayed_message_ids}},
+      {subscriber_pid, {:rc_fastlane, self(), TestSerializer, "realtime:topic", replayed_message_ids}}
+    ]
+
+    msg = %Broadcast{
+      topic: "some:other:topic",
+      event: "event",
+      payload: %{"data" => "test", "meta" => %{"id" => "123"}}
+    }
+
+    assert MessageDispatcher.dispatch(subscribers, from_pid, msg) == :ok
+
+    assert Agent.get(TestSerializer, & &1) == 0
+
+    refute_receive _any
+  end
+
+  test "payload is not a map" do
+    parent = self()
+
+    subscriber_pid =
+      spawn(fn ->
+        loop = fn loop ->
+          receive do
+            msg ->
+              send(parent, {:subscriber, msg})
+              loop.(loop)
+          end
+        end
+
+        loop.(loop)
+      end)
+
+    from_pid = :erlang.list_to_pid(~c'<0.2.1>')
+
+    subscribers = [
+      {subscriber_pid, {:rc_fastlane, self(), TestSerializer, "realtime:topic", {:log, "tenant123"}, MapSet.new()}},
+      {subscriber_pid, {:rc_fastlane, self(), TestSerializer, "realtime:topic", MapSet.new()}}
+    ]
+
+    msg = %Broadcast{topic: "some:other:topic", event: "event", payload: "not a map"}
+
+    log =
+      capture_log(fn ->
+        assert MessageDispatcher.dispatch(subscribers, from_pid, msg) == :ok
+      end)
+
+    assert log =~ "Received message on realtime:topic with payload: #{inspect(msg, pretty: true)}"
+
+    assert_receive {:encoded, %Broadcast{event: "event", payload: "not a map", topic: "realtime:topic"}}
+    assert_receive {:encoded, %Broadcast{event: "event", payload: "not a map", topic: "realtime:topic"}}
+
+    assert
Agent.get(TestSerializer, & &1) == 1 + + assert_receive {:subscriber, :update_rate_counter} + assert_receive {:subscriber, :update_rate_counter} + + refute_receive _any + end + test "dispatches messages to non fastlane subscribers" do from_pid = :erlang.list_to_pid(~c'<0.2.1>') diff --git a/test/realtime_web/channels/realtime_channel/presence_handler_test.exs b/test/realtime_web/channels/realtime_channel/presence_handler_test.exs index e5ecd32ad..0cdf422e2 100644 --- a/test/realtime_web/channels/realtime_channel/presence_handler_test.exs +++ b/test/realtime_web/channels/realtime_channel/presence_handler_test.exs @@ -99,7 +99,7 @@ defmodule RealtimeWeb.RealtimeChannel.PresenceHandlerTest do end end - describe "handle/2" do + describe "handle/3" do test "with true policy and is private, user can track their presence and changes", %{ tenant: tenant, topic: topic, @@ -142,7 +142,7 @@ defmodule RealtimeWeb.RealtimeChannel.PresenceHandlerTest do policies = %Policies{presence: %PresencePolicies{read: false, write: false}} socket = socket_fixture(tenant, topic, key, policies: policies, private?: false) - assert {:ok, _socket} = PresenceHandler.handle(%{"event" => "track"}, socket) + assert {:ok, _socket} = PresenceHandler.handle(%{"event" => "track"}, nil, socket) topic = socket.assigns.tenant_topic assert_receive %Broadcast{topic: ^topic, event: "presence_diff", payload: %{joins: joins, leaves: %{}}} @@ -229,6 +229,7 @@ defmodule RealtimeWeb.RealtimeChannel.PresenceHandlerTest do assert {:ok, socket} = PresenceHandler.handle( %{"event" => "track", "payload" => %{"metadata" => random_string()}}, + nil, socket ) @@ -248,7 +249,7 @@ defmodule RealtimeWeb.RealtimeChannel.PresenceHandlerTest do assert log =~ "UnknownPresenceEvent" end - test "socket with presence enabled false will ignore presence events in public channel", %{ + test "socket with presence enabled false will ignore non-track presence events in public channel", %{ tenant: tenant, topic: topic } do @@ -256,12 +257,12 @@ defmodule RealtimeWeb.RealtimeChannel.PresenceHandlerTest do policies = %Policies{presence: %PresencePolicies{read: true, write: true}} socket = socket_fixture(tenant, topic, key, policies: policies, private?: false, enabled?: false) - assert {:ok, _socket} = PresenceHandler.handle(%{"event" => "track"}, socket) + assert {:ok, _socket} = PresenceHandler.handle(%{"event" => "untrack"}, nil, socket) topic = socket.assigns.tenant_topic refute_receive %Broadcast{topic: ^topic, event: "presence_diff"} end - test "socket with presence enabled false will ignore presence events in private channel", %{ + test "socket with presence enabled false will ignore non-track presence events in private channel", %{ tenant: tenant, topic: topic, db_conn: db_conn @@ -270,11 +271,80 @@ defmodule RealtimeWeb.RealtimeChannel.PresenceHandlerTest do policies = %Policies{presence: %PresencePolicies{read: true, write: true}} socket = socket_fixture(tenant, topic, key, policies: policies, private?: false, enabled?: false) - assert {:ok, _socket} = PresenceHandler.handle(%{"event" => "track"}, db_conn, socket) + assert {:ok, _socket} = PresenceHandler.handle(%{"event" => "untrack"}, db_conn, socket) topic = socket.assigns.tenant_topic refute_receive %Broadcast{topic: ^topic, event: "presence_diff"} end + test "socket with presence disabled will enable presence on track message for public channel", %{ + tenant: tenant, + topic: topic + } do + key = random_string() + policies = %Policies{presence: %PresencePolicies{read: true, write: true}} + socket = 
socket_fixture(tenant, topic, key, policies: policies, private?: false, enabled?: false) + + refute socket.assigns.presence_enabled? + + assert {:ok, updated_socket} = PresenceHandler.handle(%{"event" => "track"}, nil, socket) + + assert updated_socket.assigns.presence_enabled? + topic = socket.assigns.tenant_topic + assert_receive %Broadcast{topic: ^topic, event: "presence_diff", payload: %{joins: joins, leaves: %{}}} + assert Map.has_key?(joins, key) + end + + test "socket with presence disabled will enable presence on track message for private channel", %{ + tenant: tenant, + topic: topic, + db_conn: db_conn + } do + key = random_string() + policies = %Policies{presence: %PresencePolicies{read: true, write: true}} + socket = socket_fixture(tenant, topic, key, policies: policies, private?: true, enabled?: false) + + refute socket.assigns.presence_enabled? + + assert {:ok, updated_socket} = PresenceHandler.handle(%{"event" => "track"}, db_conn, socket) + + assert updated_socket.assigns.presence_enabled? + topic = socket.assigns.tenant_topic + assert_receive %Broadcast{topic: ^topic, event: "presence_diff", payload: %{joins: joins, leaves: %{}}} + assert Map.has_key?(joins, key) + end + + test "socket with presence disabled will not enable presence on untrack message", %{ + tenant: tenant, + topic: topic, + db_conn: db_conn + } do + key = random_string() + policies = %Policies{presence: %PresencePolicies{read: true, write: true}} + socket = socket_fixture(tenant, topic, key, policies: policies, enabled?: false) + + refute socket.assigns.presence_enabled? + + assert {:ok, updated_socket} = PresenceHandler.handle(%{"event" => "untrack"}, db_conn, socket) + + refute updated_socket.assigns.presence_enabled? + topic = socket.assigns.tenant_topic + refute_receive %Broadcast{topic: ^topic, event: "presence_diff"} + end + + test "socket with presence disabled will not enable presence on unknown event", %{ + tenant: tenant, + topic: topic, + db_conn: db_conn + } do + key = random_string() + policies = %Policies{presence: %PresencePolicies{read: true, write: true}} + socket = socket_fixture(tenant, topic, key, policies: policies, enabled?: false) + + refute socket.assigns.presence_enabled? 
+ + assert {:error, :unknown_presence_event} = PresenceHandler.handle(%{"event" => "unknown"}, db_conn, socket) + end + @tag policies: [:authenticated_read_broadcast_and_presence, :authenticated_write_broadcast_and_presence] test "rate limit is checked on private channel", %{tenant: tenant, topic: topic, db_conn: db_conn} do key = random_string() diff --git a/test/realtime_web/channels/realtime_channel_test.exs b/test/realtime_web/channels/realtime_channel_test.exs index 2dff83da3..0a0d8aca9 100644 --- a/test/realtime_web/channels/realtime_channel_test.exs +++ b/test/realtime_web/channels/realtime_channel_test.exs @@ -28,6 +28,216 @@ defmodule RealtimeWeb.RealtimeChannelTest do setup :rls_context + test "max heap size is set", %{tenant: tenant} do + jwt = Generators.generate_jwt_token(tenant) + {:ok, %Socket{} = socket} = connect(UserSocket, %{}, conn_opts(tenant, jwt)) + + assert Process.info(socket.transport_pid, :max_heap_size) == + {:max_heap_size, %{error_logger: true, include_shared_binaries: false, kill: true, size: 6_250_000}} + end + + describe "broadcast" do + @describetag policies: [:authenticated_all_topic_read] + + test "broadcast map payload", %{tenant: tenant} do + jwt = Generators.generate_jwt_token(tenant) + {:ok, %Socket{} = socket} = connect(UserSocket, %{}, conn_opts(tenant, jwt)) + + config = %{ + "presence" => %{"enabled" => false}, + "broadcast" => %{"self" => true} + } + + assert {:ok, _, socket} = subscribe_and_join(socket, "realtime:test", %{"config" => config}) + + push(socket, "broadcast", %{"event" => "my_event", "payload" => %{"hello" => "world"}}) + + assert_receive %Phoenix.Socket.Message{ + topic: "realtime:test", + event: "broadcast", + payload: %{"event" => "my_event", "payload" => %{"hello" => "world"}} + } + end + + test "broadcast non-map payload", %{tenant: tenant} do + jwt = Generators.generate_jwt_token(tenant) + {:ok, %Socket{} = socket} = connect(UserSocket, %{}, conn_opts(tenant, jwt)) + + config = %{ + "presence" => %{"enabled" => false}, + "broadcast" => %{"self" => true} + } + + assert {:ok, _, socket} = subscribe_and_join(socket, "realtime:test", %{"config" => config}) + + push(socket, "broadcast", "not a map") + + assert_receive %Phoenix.Socket.Message{ + topic: "realtime:test", + event: "broadcast", + payload: "not a map" + } + end + + test "wrong replay params", %{tenant: tenant} do + jwt = Generators.generate_jwt_token(tenant) + {:ok, %Socket{} = socket} = connect(UserSocket, %{"log_level" => "warning"}, conn_opts(tenant, jwt)) + + config = %{ + "private" => true, + "broadcast" => %{ + "replay" => %{"limit" => "not a number", "since" => :erlang.system_time(:millisecond) - 5 * 60000} + } + } + + assert {:error, %{reason: "UnableToReplayMessages: Replay params are not valid"}} = + subscribe_and_join(socket, "realtime:test", %{"config" => config}) + + config = %{ + "private" => true, + "broadcast" => %{ + "replay" => %{"limit" => 1, "since" => "not a number"} + } + } + + assert {:error, %{reason: "UnableToReplayMessages: Replay params are not valid"}} = + subscribe_and_join(socket, "realtime:test", %{"config" => config}) + + config = %{ + "private" => true, + "broadcast" => %{ + "replay" => %{} + } + } + + assert {:error, %{reason: "UnableToReplayMessages: Replay params are not valid"}} = + subscribe_and_join(socket, "realtime:test", %{"config" => config}) + end + + test "failure to replay", %{tenant: tenant} do + jwt = Generators.generate_jwt_token(tenant) + {:ok, %Socket{} = socket} = connect(UserSocket, %{"log_level" => "warning"}, 
conn_opts(tenant, jwt)) + + config = %{ + "private" => true, + "broadcast" => %{ + "replay" => %{"limit" => 12, "since" => :erlang.system_time(:millisecond) - 5 * 60000} + } + } + + Authorization + |> expect(:get_read_authorizations, fn _, _, _ -> + {:ok, + %Authorization.Policies{ + broadcast: %Authorization.Policies.BroadcastPolicies{read: true, write: nil} + }} + end) + + # Broken database connection + conn = spawn(fn -> :ok end) + Connect.lookup_or_start_connection(tenant.external_id) + {:ok, _} = :syn.update_registry(Connect, tenant.external_id, fn _pid, meta -> %{meta | conn: conn} end) + + assert {:error, %{reason: "UnableToReplayMessages: Realtime was unable to replay messages"}} = + subscribe_and_join(socket, "realtime:test", %{"config" => config}) + end + + test "replay messages on public topic not allowed", %{tenant: tenant} do + jwt = Generators.generate_jwt_token(tenant) + {:ok, %Socket{} = socket} = connect(UserSocket, %{"log_level" => "warning"}, conn_opts(tenant, jwt)) + + config = %{ + "presence" => %{"enabled" => false}, + "broadcast" => %{"replay" => %{"limit" => 2, "since" => :erlang.system_time(:millisecond) - 5 * 60000}} + } + + assert { + :error, + %{reason: "UnableToReplayMessages: Replay params are not valid"} + } = subscribe_and_join(socket, "realtime:test", %{"config" => config}) + + refute_receive _any + end + + @tag policies: [:authenticated_all_topic_read] + test "replay messages on private topic", %{tenant: tenant} do + jwt = Generators.generate_jwt_token(tenant) + {:ok, %Socket{} = socket} = connect(UserSocket, %{"log_level" => "warning"}, conn_opts(tenant, jwt)) + + # Old message + message_fixture(tenant, %{ + "private" => true, + "inserted_at" => NaiveDateTime.utc_now() |> NaiveDateTime.add(-1, :day), + "event" => "old", + "extension" => "broadcast", + "topic" => "test", + "payload" => %{"value" => "old"} + }) + + %{id: message1_id} = + message_fixture(tenant, %{ + "private" => true, + "inserted_at" => NaiveDateTime.utc_now() |> NaiveDateTime.add(-1, :minute), + "event" => "first", + "extension" => "broadcast", + "topic" => "test", + "payload" => %{"value" => "first"} + }) + + %{id: message2_id} = + message_fixture(tenant, %{ + "private" => true, + "inserted_at" => NaiveDateTime.utc_now() |> NaiveDateTime.add(-2, :minute), + "event" => "second", + "extension" => "broadcast", + "topic" => "test", + "payload" => %{"value" => "second"} + }) + + # This one should not be received because of the limit + message_fixture(tenant, %{ + "private" => true, + "inserted_at" => NaiveDateTime.utc_now() |> NaiveDateTime.add(-3, :minute), + "event" => "third", + "extension" => "broadcast", + "topic" => "test", + "payload" => %{"value" => "third"} + }) + + config = %{ + "private" => true, + "presence" => %{"enabled" => false}, + "broadcast" => %{"replay" => %{"limit" => 2, "since" => :erlang.system_time(:millisecond) - 5 * 60000}} + } + + assert {:ok, _, %Socket{}} = subscribe_and_join(socket, "realtime:test", %{"config" => config}) + + assert_receive %Socket.Message{ + topic: "realtime:test", + event: "broadcast", + payload: %{ + "event" => "first", + "meta" => %{"id" => ^message1_id, "replayed" => true}, + "payload" => %{"value" => "first"}, + "type" => "broadcast" + } + } + + assert_receive %Socket.Message{ + topic: "realtime:test", + event: "broadcast", + payload: %{ + "event" => "second", + "meta" => %{"id" => ^message2_id, "replayed" => true}, + "payload" => %{"value" => "second"}, + "type" => "broadcast" + } + } + + refute_receive %Socket.Message{} + end + end + 
describe "presence" do test "events are counted", %{tenant: tenant} do jwt = Generators.generate_jwt_token(tenant) diff --git a/test/realtime_web/tenant_broadcaster_test.exs b/test/realtime_web/tenant_broadcaster_test.exs index d9afbf641..ddda381a1 100644 --- a/test/realtime_web/tenant_broadcaster_test.exs +++ b/test/realtime_web/tenant_broadcaster_test.exs @@ -1,5 +1,5 @@ defmodule RealtimeWeb.TenantBroadcasterTest do - # Usage of Clustered + # Usage of Clustered and changing Application env use Realtime.DataCase, async: false alias Phoenix.Socket.Broadcast @@ -47,95 +47,107 @@ defmodule RealtimeWeb.TenantBroadcasterTest do pid: self() ) + original = Application.fetch_env!(:realtime, :pubsub_adapter) + on_exit(fn -> Application.put_env(:realtime, :pubsub_adapter, original) end) + Application.put_env(:realtime, :pubsub_adapter, context.pubsub_adapter) + :ok end - describe "pubsub_broadcast/4" do - test "pubsub_broadcast", %{node: node} do - message = %Broadcast{topic: @topic, event: "an event", payload: %{"a" => "b"}} - TenantBroadcaster.pubsub_broadcast("realtime-dev", @topic, message, Phoenix.PubSub) + for pubsub_adapter <- [:gen_rpc, :pg2] do + describe "pubsub_broadcast/4 #{pubsub_adapter}" do + @describetag pubsub_adapter: pubsub_adapter - assert_receive ^message + test "pubsub_broadcast", %{node: node} do + message = %Broadcast{topic: @topic, event: "an event", payload: %{"a" => "b"}} + TenantBroadcaster.pubsub_broadcast("realtime-dev", @topic, message, Phoenix.PubSub) - # Remote node received the broadcast - assert_receive {:relay, ^node, ^message} + assert_receive ^message - assert_receive { - :telemetry, - [:realtime, :tenants, :payload, :size], - %{size: 114}, - %{tenant: "realtime-dev"} - } - end + # Remote node received the broadcast + assert_receive {:relay, ^node, ^message} - test "pubsub_broadcast list payload", %{node: node} do - message = %Broadcast{topic: @topic, event: "an event", payload: ["a", %{"b" => "c"}, 1, 23]} - TenantBroadcaster.pubsub_broadcast("realtime-dev", @topic, message, Phoenix.PubSub) + assert_receive { + :telemetry, + [:realtime, :tenants, :payload, :size], + %{size: 114}, + %{tenant: "realtime-dev"} + } + end - assert_receive ^message + test "pubsub_broadcast list payload", %{node: node} do + message = %Broadcast{topic: @topic, event: "an event", payload: ["a", %{"b" => "c"}, 1, 23]} + TenantBroadcaster.pubsub_broadcast("realtime-dev", @topic, message, Phoenix.PubSub) - # Remote node received the broadcast - assert_receive {:relay, ^node, ^message} + assert_receive ^message - assert_receive { - :telemetry, - [:realtime, :tenants, :payload, :size], - %{size: 130}, - %{tenant: "realtime-dev"} - } - end + # Remote node received the broadcast + assert_receive {:relay, ^node, ^message} - test "pubsub_broadcast string payload", %{node: node} do - message = %Broadcast{topic: @topic, event: "an event", payload: "some text payload"} - TenantBroadcaster.pubsub_broadcast("realtime-dev", @topic, message, Phoenix.PubSub) + assert_receive { + :telemetry, + [:realtime, :tenants, :payload, :size], + %{size: 130}, + %{tenant: "realtime-dev"} + } + end - assert_receive ^message + test "pubsub_broadcast string payload", %{node: node} do + message = %Broadcast{topic: @topic, event: "an event", payload: "some text payload"} + TenantBroadcaster.pubsub_broadcast("realtime-dev", @topic, message, Phoenix.PubSub) - # Remote node received the broadcast - assert_receive {:relay, ^node, ^message} + assert_receive ^message - assert_receive { - :telemetry, - [:realtime, :tenants, 
:payload, :size], - %{size: 119}, - %{tenant: "realtime-dev"} - } + # Remote node received the broadcast + assert_receive {:relay, ^node, ^message} + + assert_receive { + :telemetry, + [:realtime, :tenants, :payload, :size], + %{size: 119}, + %{tenant: "realtime-dev"} + } + end end end - describe "pubsub_broadcast_from/5" do - test "pubsub_broadcast_from", %{node: node} do - parent = self() + for pubsub_adapter <- [:gen_rpc, :pg2] do + describe "pubsub_broadcast_from/5 #{pubsub_adapter}" do + @describetag pubsub_adapter: pubsub_adapter + + test "pubsub_broadcast_from", %{node: node} do + parent = self() - spawn_link(fn -> - Endpoint.subscribe(@topic) - send(parent, :ready) + spawn_link(fn -> + Endpoint.subscribe(@topic) + send(parent, :ready) - receive do - msg -> send(parent, {:other_process, msg}) - end - end) + receive do + msg -> send(parent, {:other_process, msg}) + end + end) - assert_receive :ready + assert_receive :ready - message = %Broadcast{topic: @topic, event: "an event", payload: %{"a" => "b"}} + message = %Broadcast{topic: @topic, event: "an event", payload: %{"a" => "b"}} - TenantBroadcaster.pubsub_broadcast_from("realtime-dev", self(), @topic, message, Phoenix.PubSub) + TenantBroadcaster.pubsub_broadcast_from("realtime-dev", self(), @topic, message, Phoenix.PubSub) - assert_receive {:other_process, ^message} + assert_receive {:other_process, ^message} - # Remote node received the broadcast - assert_receive {:relay, ^node, ^message} + # Remote node received the broadcast + assert_receive {:relay, ^node, ^message} - assert_receive { - :telemetry, - [:realtime, :tenants, :payload, :size], - %{size: 114}, - %{tenant: "realtime-dev"} - } + assert_receive { + :telemetry, + [:realtime, :tenants, :payload, :size], + %{size: 114}, + %{tenant: "realtime-dev"} + } - # This process does not receive the message - refute_receive _any + # This process does not receive the message + refute_receive _any + end end end diff --git a/test/support/containers.ex b/test/support/containers.ex index cd66f2699..bc49fa275 100644 --- a/test/support/containers.ex +++ b/test/support/containers.ex @@ -267,7 +267,13 @@ defmodule Containers do @image, "postgres", "-c", - "config_file=/etc/postgresql/postgresql.conf" + "config_file=/etc/postgresql/postgresql.conf", + "-c", + "wal_keep_size=32MB", + "-c", + "max_wal_size=32MB", + "-c", + "max_slot_wal_keep_size=32MB" ]) end end From b5161089f87645c64f4e833984d6bf5d7cff7e35 Mon Sep 17 00:00:00 2001 From: "Al @h0lybyte" <5599058+h0lybyte@users.noreply.github.com> Date: Wed, 24 Sep 2025 21:16:36 -0400 Subject: [PATCH 5/8] =?UTF-8?q?=F0=9F=94=84=20Sync=20with=20upstream=20cha?= =?UTF-8?q?nges=20(#9)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: runtime setup error (#1520) * fix: use primary instead of replica on rename_settings_field (#1521) * feat: upgrade cowboy & ranch (#1523) * fix: Fix GenRpc to not try to connect to nodes that are not alive (#1525) * fix: enable presence on track message (#1527) currently the user would need to have enabled from the beginning of the channel. 
this will enable users to enable presence later in the flow by sending a track message which will enable presence messages for them
* fix: set cowboy active_n=100 as in cowboy 2.12.0 (#1530) cowboy 2.13.0 sets the default active_n=1
* fix: provide error_code metadata on RealtimeChannel.Logging (#1531)
* feat: disable UTF8 validation on websocket frames (#1532) Currently all text frames are handled only with JSON, which already requires UTF-8
* fix: move DB setup to happen after Connect.init (#1533) This change reduces the impact of slow DB setup on other tenants that landed on the same partition while trying to connect at the same time
* fix: handle wal bloat (#1528) Verify that the replication connection is able to reconnect when faced with WAL bloat issues
* feat: replay realtime.messages (#1526) A new index was created on inserted_at DESC, topic WHERE private IS TRUE AND extension = "broadcast" The hardcoded limit is 25 for now.
* feat: gen_rpc pub sub adapter (#1529) Add a PubSub adapter that uses gen_rpc to send messages to other nodes. It uses :gen_rpc.abcast/3 instead of :erlang.send/2. The adapter works very similarly to the PG2 adapter. It consists of multiple workers that forward to the local node using PubSub.local_broadcast. The worker is chosen based on the sending process, just like the PG2 adapter does. The number of workers is controlled by `:pool_size` or `:broadcast_pool_size`. This distinction exists because Phoenix.PubSub uses `:pool_size` to define how many partitions the PubSub registry will use. It's possible to control them separately by using `:broadcast_pool_size` (a minimal worker sketch follows these notes)
* fix: ensure message id doesn't raise on non-map payloads (#1534)
* fix: match error on Connect (#1536)
---------
Co-authored-by: Eduardo Gurgel Pinho
* feat: websocket max heap size configuration (#1538)
* fix: set max process heap size to 500MB instead of 8GB
* feat: set websocket transport max heap size WEBSOCKET_MAX_HEAP_SIZE can be used to configure it
* fix: update gen_rpc to fix gen_rpc_dispatcher issues (#1537) Issues:
* Single gen_rpc_dispatcher that can be a bottleneck if connecting takes some time
* Many calls can land on the dispatcher but the node might be gone already. If we don't validate the node it might keep trying to connect until it times out instead of quickly giving up due to not being an actively connected node.
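For illustration, a minimal sketch of the worker flow described in the gen_rpc pub sub adapter note above. This is an assumption-laden sketch, not the shipped module: the module name, the registered worker names, and the {:forward, ...} message shape are invented for the example; only :gen_rpc.abcast/3, Phoenix.PubSub.local_broadcast/3, and :erlang.phash2/2 are real APIs.

    defmodule GenRpcPubSubSketch do
      use GenServer

      # Illustrative worker: remote nodes abcast to a registered name that
      # exists on every node, and the worker re-publishes locally only.
      def start_link(index) do
        GenServer.start_link(__MODULE__, [], name: worker_name(index))
      end

      # Sender side: pick a worker from the sending process, PG2-style,
      # fan out to all connected nodes via :gen_rpc.abcast/3, then publish
      # to subscribers on this node directly.
      def broadcast(pubsub, pool_size, topic, message) do
        name = worker_name(:erlang.phash2(self(), pool_size))
        :gen_rpc.abcast(Node.list(), name, {:forward, pubsub, topic, message})
        Phoenix.PubSub.local_broadcast(pubsub, topic, message)
      end

      defp worker_name(index), do: :"gen_rpc_pub_sub_sketch_#{index}"

      @impl true
      def init(state), do: {:ok, state}

      # Receiver side: abcast delivers a plain message, so it arrives in
      # handle_info; forward it to local subscribers.
      @impl true
      def handle_info({:forward, pubsub, topic, message}, state) do
        Phoenix.PubSub.local_broadcast(pubsub, topic, message)
        {:noreply, state}
      end
    end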
* fix: improve ErlSysMon logging for processes (#1540) Include initial_call, ancestors, registered_name, message_queue_len and total_heap_size. Also bump long_schedule and long_gc
* fix: make pubsub adapter configurable (#1539)
---------
Co-authored-by: Filipe Cabaço
Co-authored-by: Eduardo Gurgel
Co-authored-by: Bradley Haljendi <5642609+Fudster@users.noreply.github.com>
---
 lib/realtime/api.ex | 2 +-
 mix.exs | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/realtime/api.ex b/lib/realtime/api.ex
index ac1f7d291..f612a5c1e 100644
--- a/lib/realtime/api.ex
+++ b/lib/realtime/api.ex
@@ -186,7 +186,7 @@ defmodule Realtime.Api do
     |> repo.preload(:extensions)
   end

-  def list_extensions(type) do
+  defp list_extensions(type) do
     query = from(e in Extensions, where: e.type == ^type, select: e)
     Repo.all(query)

diff --git a/mix.exs b/mix.exs
index 1db3bf5b8..9c66b3dde 100644
--- a/mix.exs
+++ b/mix.exs
@@ -4,7 +4,7 @@ defmodule Realtime.MixProject do
   def project do
     [
       app: :realtime,
-      version: "2.46.4",
+      version: "2.51.3",
       elixir: "~> 1.17.3",
       elixirc_paths: elixirc_paths(Mix.env()),
       start_permanent: Mix.env() == :prod,

From e962a57a28aa0d46feb1401b768925aa43c4e3f9 Mon Sep 17 00:00:00 2001
From: "Al @h0lybyte" <5599058+h0lybyte@users.noreply.github.com>
Date: Sat, 4 Oct 2025 03:11:46 -0400
Subject: [PATCH 6/8] =?UTF-8?q?=F0=9F=94=84=20Sync=20with=20upstream=20cha?=
 =?UTF-8?q?nges=20(#11)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix: runtime setup error (#1520)
* fix: use primary instead of replica on rename_settings_field (#1521)
* feat: upgrade cowboy & ranch (#1523)
* fix: Fix GenRpc to not try to connect to nodes that are not alive (#1525)
* fix: enable presence on track message (#1527) currently the user would need to have enabled from the beginning of the channel. this will enable users to enable presence later in the flow by sending a track message which will enable presence messages for them
* fix: set cowboy active_n=100 as in cowboy 2.12.0 (#1530) cowboy 2.13.0 sets the default active_n=1
* fix: provide error_code metadata on RealtimeChannel.Logging (#1531)
* feat: disable UTF8 validation on websocket frames (#1532) Currently all text frames are handled only with JSON, which already requires UTF-8
* fix: move DB setup to happen after Connect.init (#1533) This change reduces the impact of slow DB setup on other tenants that landed on the same partition while trying to connect at the same time
* fix: handle wal bloat (#1528) Verify that the replication connection is able to reconnect when faced with WAL bloat issues
* feat: replay realtime.messages (#1526) A new index was created on inserted_at DESC, topic WHERE private IS TRUE AND extension = "broadcast" The hardcoded limit is 25 for now.
* feat: gen_rpc pub sub adapter (#1529) Add a PubSub adapter that uses gen_rpc to send messages to other nodes. It uses :gen_rpc.abcast/3 instead of :erlang.send/2. The adapter works very similarly to the PG2 adapter. It consists of multiple workers that forward to the local node using PubSub.local_broadcast. The worker is chosen based on the sending process, just like the PG2 adapter does. The number of workers is controlled by `:pool_size` or `:broadcast_pool_size`. This distinction exists because Phoenix.PubSub uses `:pool_size` to define how many partitions the PubSub registry will use.
It's possible to control them separately by using `:broadcast_pool_size`
* fix: ensure message id doesn't raise on non-map payloads (#1534)
* fix: match error on Connect (#1536)
---------
Co-authored-by: Eduardo Gurgel Pinho
* feat: websocket max heap size configuration (#1538)
* fix: set max process heap size to 500MB instead of 8GB
* feat: set websocket transport max heap size WEBSOCKET_MAX_HEAP_SIZE can be used to configure it
* fix: update gen_rpc to fix gen_rpc_dispatcher issues (#1537) Issues:
* Single gen_rpc_dispatcher that can be a bottleneck if connecting takes some time
* Many calls can land on the dispatcher but the node might be gone already. If we don't validate the node it might keep trying to connect until it times out instead of quickly giving up due to not being an actively connected node.
* fix: improve ErlSysMon logging for processes (#1540) Include initial_call, ancestors, registered_name, message_queue_len and total_heap_size. Also bump long_schedule and long_gc
* fix: make pubsub adapter configurable (#1539)
* fix: specify that only private channels are allowed when replaying messages (#1543)
* fix: rate limit connect module (#1541) On bad connection, we rate limit the Connect module so we prevent abuse and excessive logging of errors
---------
Co-authored-by: Filipe Cabaço
Co-authored-by: Eduardo Gurgel
Co-authored-by: Bradley Haljendi <5642609+Fudster@users.noreply.github.com>
---
 README.md | 1 +
 lib/realtime/tenants.ex | 26 ++++++++++
 lib/realtime/tenants/connect.ex | 32 +++++++------
 lib/realtime_web/channels/realtime_channel.ex | 9 +++-
 mix.exs | 2 +-
 test/realtime/tenants/connect_test.exs | 47 +++++++++++++++++++
 .../channels/realtime_channel_test.exs | 2 +-
 .../controllers/broadcast_controller_test.exs | 18 +++++--
 8 files changed, 116 insertions(+), 21 deletions(-)

diff --git a/README.md b/README.md
index 4e13e44df..7dd223bf3 100644
--- a/README.md
+++ b/README.md
@@ -243,6 +243,7 @@ This is the list of operational codes that can help you understand your deployme
 | ChannelRateLimitReached | The number of channels you can create has reached its limit |
 | ConnectionRateLimitReached | The number of connected clients has reached its limit |
 | ClientJoinRateLimitReached | The rate of joins per second from your clients has reached the channel limits |
+| DatabaseConnectionRateLimitReached | The rate of attempts to connect to the tenant's database has reached the limit |
 | MessagePerSecondRateLimitReached | The rate of messages per second from your clients has reached the channel limits |
 | RealtimeDisabledForTenant | Realtime has been disabled for the tenant |
 | UnableToConnectToTenantDatabase | Realtime was not able to connect to the tenant's database |

diff --git a/lib/realtime/tenants.ex b/lib/realtime/tenants.ex
index 63965abea..db2a02cc4 100644
--- a/lib/realtime/tenants.ex
+++ b/lib/realtime/tenants.ex
@@ -328,6 +328,32 @@ defmodule Realtime.Tenants do
     %RateCounter.Args{id: {:channel, :authorization_errors, external_id}, opts: opts}
   end

+  @connect_per_second_default 10
+  @doc "RateCounter arguments for counting connect per second."
+ @spec connect_per_second_rate(Tenant.t() | String.t()) :: RateCounter.Args.t() + def connect_per_second_rate(%Tenant{external_id: external_id}) do + connect_per_second_rate(external_id) + end + + def connect_per_second_rate(tenant_id) do + opts = [ + max_bucket_len: 10, + limit: [ + value: @connect_per_second_default, + measurement: :sum, + log_fn: fn -> + Logger.critical( + "DatabaseConnectionRateLimitReached: Too many connection attempts against the tenant database", + external_id: tenant_id, + project: tenant_id + ) + end + ] + ] + + %RateCounter.Args{id: {:database, :connect, tenant_id}, opts: opts} + end + defp pool_size(%{extensions: [%{settings: settings} | _]}) do Database.pool_size_by_application_name("realtime_connect", settings) end diff --git a/lib/realtime/tenants/connect.ex b/lib/realtime/tenants/connect.ex index 3d8f39833..0ee43f161 100644 --- a/lib/realtime/tenants/connect.ex +++ b/lib/realtime/tenants/connect.ex @@ -11,8 +11,9 @@ defmodule Realtime.Tenants.Connect do use Realtime.Logs - alias Realtime.Tenants.Rebalancer alias Realtime.Api.Tenant + alias Realtime.GenCounter + alias Realtime.RateCounter alias Realtime.Rpc alias Realtime.Tenants alias Realtime.Tenants.Connect.CheckConnection @@ -20,6 +21,7 @@ defmodule Realtime.Tenants.Connect do alias Realtime.Tenants.Connect.Piper alias Realtime.Tenants.Connect.RegisterProcess alias Realtime.Tenants.Migrations + alias Realtime.Tenants.Rebalancer alias Realtime.Tenants.ReplicationConnection alias Realtime.UsersCounter @@ -39,11 +41,8 @@ defmodule Realtime.Tenants.Connect do @doc "Check if Connect has finished setting up connections" def ready?(tenant_id) do case whereis(tenant_id) do - pid when is_pid(pid) -> - GenServer.call(pid, :ready?) - - _ -> - false + pid when is_pid(pid) -> GenServer.call(pid, :ready?) 
+ _ -> false end end @@ -55,24 +54,29 @@ defmodule Realtime.Tenants.Connect do | {:error, :tenant_database_unavailable} | {:error, :initializing} | {:error, :tenant_database_connection_initializing} - | {:error, :tenant_db_too_many_connections} + | {:error, :connect_rate_limit_reached} | {:error, :rpc_error, term()} def lookup_or_start_connection(tenant_id, opts \\ []) when is_binary(tenant_id) do - case get_status(tenant_id) do - {:ok, conn} -> - {:ok, conn} + rate_args = Tenants.connect_per_second_rate(tenant_id) + RateCounter.new(rate_args) - {:error, :tenant_database_unavailable} -> - {:error, :tenant_database_unavailable} + with {:ok, %{limit: %{triggered: false}}} <- RateCounter.get(rate_args), + {:ok, conn} <- get_status(tenant_id) do + {:ok, conn} + else + {:ok, %{limit: %{triggered: true}}} -> + {:error, :connect_rate_limit_reached} {:error, :tenant_database_connection_initializing} -> + GenCounter.add(rate_args.id) call_external_node(tenant_id, opts) {:error, :initializing} -> {:error, :tenant_database_unavailable} - {:error, :tenant_db_too_many_connections} -> - {:error, :tenant_db_too_many_connections} + {:error, reason} -> + GenCounter.add(rate_args.id) + {:error, reason} end end diff --git a/lib/realtime_web/channels/realtime_channel.ex b/lib/realtime_web/channels/realtime_channel.ex index 1d58d9da7..91a417c21 100644 --- a/lib/realtime_web/channels/realtime_channel.ex +++ b/lib/realtime_web/channels/realtime_channel.ex @@ -167,6 +167,10 @@ defmodule RealtimeWeb.RealtimeChannel do msg = "Database can't accept more connections, Realtime won't connect" log_error(socket, "DatabaseLackOfConnections", msg) + {:error, :connect_rate_limit_reached} -> + msg = "Too many database connections attempts per second" + log_error(socket, "DatabaseConnectionRateLimitReached", msg) + {:error, :unable_to_set_policies, error} -> log_error(socket, "UnableToSetPolicies", error) {:error, %{reason: "Realtime was unable to connect to the project database"}} @@ -213,6 +217,9 @@ defmodule RealtimeWeb.RealtimeChannel do {:error, :invalid_replay_params} -> log_error(socket, "UnableToReplayMessages", "Replay params are not valid") + {:error, :invalid_replay_channel} -> + log_error(socket, "UnableToReplayMessages", "Replay is not allowed for public channels") + {:error, error} -> log_error(socket, "UnknownErrorOnChannel", error) {:error, %{reason: "Unknown Error on Channel"}} @@ -790,7 +797,7 @@ defmodule RealtimeWeb.RealtimeChannel do end defp maybe_replay_messages(%{"broadcast" => %{"replay" => _}}, _sub_topic, _db_conn, false = _private?) do - {:error, :invalid_replay_params} + {:error, :invalid_replay_channel} end defp maybe_replay_messages(%{"broadcast" => %{"replay" => replay_params}}, sub_topic, db_conn, true = _private?) 
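Read together, the Connect changes above implement a gate-and-count pattern: check a shared per-tenant rate counter before attempting anything, and bump it only when an attempt fails. The sketch below is a simplified restatement of lookup_or_start_connection/2 outside the `with` expression, using the same Tenants, RateCounter and GenCounter calls as the diff; the RPC fallback and the :initializing branch are elided, so it is a reading aid rather than the shipped function.

    rate_args = Tenants.connect_per_second_rate(tenant_id)
    RateCounter.new(rate_args)

    case RateCounter.get(rate_args) do
      {:ok, %{limit: %{triggered: true}}} ->
        # Too many recent failed attempts: refuse before touching the database.
        {:error, :connect_rate_limit_reached}

      {:ok, _} ->
        case get_status(tenant_id) do
          {:ok, conn} ->
            # Successful lookups are never counted, so healthy tenants
            # cannot trip the limit.
            {:ok, conn}

          {:error, reason} ->
            # Only failures bump the per-tenant counter.
            GenCounter.add(rate_args.id)
            {:error, reason}
        end
    end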
diff --git a/mix.exs b/mix.exs index 9c66b3dde..4b0b1f40c 100644 --- a/mix.exs +++ b/mix.exs @@ -4,7 +4,7 @@ defmodule Realtime.MixProject do def project do [ app: :realtime, - version: "2.51.3", + version: "2.51.5", elixir: "~> 1.17.3", elixirc_paths: elixirc_paths(Mix.env()), start_permanent: Mix.env() == :prod, diff --git a/test/realtime/tenants/connect_test.exs b/test/realtime/tenants/connect_test.exs index 8ba462b27..a52973d53 100644 --- a/test/realtime/tenants/connect_test.exs +++ b/test/realtime/tenants/connect_test.exs @@ -515,6 +515,53 @@ defmodule Realtime.Tenants.ConnectTest do assert capture_log(fn -> assert {:error, :rpc_error, _} = Connect.lookup_or_start_connection("tenant") end) =~ "project=tenant external_id=tenant [error] ErrorOnRpcCall" end + + test "rate limit connect when too many connections against bad database", %{tenant: tenant} do + extension = %{ + "type" => "postgres_cdc_rls", + "settings" => %{ + "db_host" => "127.0.0.1", + "db_name" => "postgres", + "db_user" => "supabase_admin", + "db_password" => "postgres", + "poll_interval" => 100, + "poll_max_changes" => 100, + "poll_max_record_bytes" => 1_048_576, + "region" => "us-east-1", + "ssl_enforced" => true + } + } + + {:ok, tenant} = update_extension(tenant, extension) + + log = + capture_log(fn -> + res = + for _ <- 1..50 do + Process.sleep(200) + Connect.lookup_or_start_connection(tenant.external_id) + end + + assert Enum.any?(res, fn {_, res} -> res == :connect_rate_limit_reached end) + end) + + assert log =~ "DatabaseConnectionRateLimitReached: Too many connection attempts against the tenant database" + end + + test "rate limit connect will not trigger if connection is successful", %{tenant: tenant} do + log = + capture_log(fn -> + res = + for _ <- 1..20 do + Process.sleep(500) + Connect.lookup_or_start_connection(tenant.external_id) + end + + refute Enum.any?(res, fn {_, res} -> res == :tenant_db_too_many_connections end) + end) + + refute log =~ "DatabaseConnectionRateLimitReached: Too many connection attempts against the tenant database" + end end describe "shutdown/1" do diff --git a/test/realtime_web/channels/realtime_channel_test.exs b/test/realtime_web/channels/realtime_channel_test.exs index 0a0d8aca9..ae6c1734a 100644 --- a/test/realtime_web/channels/realtime_channel_test.exs +++ b/test/realtime_web/channels/realtime_channel_test.exs @@ -153,7 +153,7 @@ defmodule RealtimeWeb.RealtimeChannelTest do assert { :error, - %{reason: "UnableToReplayMessages: Replay params are not valid"} + %{reason: "UnableToReplayMessages: Replay is not allowed for public channels"} } = subscribe_and_join(socket, "realtime:test", %{"config" => config}) refute_receive _any diff --git a/test/realtime_web/controllers/broadcast_controller_test.exs b/test/realtime_web/controllers/broadcast_controller_test.exs index 9c38d58bd..7bd426353 100644 --- a/test/realtime_web/controllers/broadcast_controller_test.exs +++ b/test/realtime_web/controllers/broadcast_controller_test.exs @@ -272,6 +272,7 @@ defmodule RealtimeWeb.BroadcastControllerTest do } do request_events_key = Tenants.requests_per_second_key(tenant) broadcast_events_key = Tenants.events_per_second_key(tenant) + connect_events_key = Tenants.connect_per_second_rate(tenant).id expect(TenantBroadcaster, :pubsub_broadcast, 5, fn _, _, _, _ -> :ok end) messages_to_send = @@ -290,7 +291,10 @@ defmodule RealtimeWeb.BroadcastControllerTest do GenCounter |> expect(:add, fn ^request_events_key -> :ok end) - |> expect(:add, length(messages), fn ^broadcast_events_key -> :ok end) + |> 
expect(:add, length(messages), fn + ^broadcast_events_key -> :ok + ^connect_events_key -> :ok + end) conn = post(conn, Routes.broadcast_path(conn, :broadcast), %{"messages" => messages}) @@ -326,6 +330,7 @@ defmodule RealtimeWeb.BroadcastControllerTest do } do request_events_key = Tenants.requests_per_second_key(tenant) broadcast_events_key = Tenants.events_per_second_key(tenant) + connect_events_key = Tenants.connect_per_second_rate(tenant).id expect(TenantBroadcaster, :pubsub_broadcast, 6, fn _, _, _, _ -> :ok end) channels = @@ -354,7 +359,10 @@ defmodule RealtimeWeb.BroadcastControllerTest do GenCounter |> expect(:add, fn ^request_events_key -> :ok end) - |> expect(:add, length(messages), fn ^broadcast_events_key -> :ok end) + |> expect(:add, length(messages), fn + ^broadcast_events_key -> :ok + ^connect_events_key -> :ok + end) conn = post(conn, Routes.broadcast_path(conn, :broadcast), %{"messages" => messages}) @@ -408,6 +416,7 @@ defmodule RealtimeWeb.BroadcastControllerTest do } do request_events_key = Tenants.requests_per_second_key(tenant) broadcast_events_key = Tenants.events_per_second_key(tenant) + connect_events_key = Tenants.connect_per_second_rate(tenant).id expect(TenantBroadcaster, :pubsub_broadcast, 5, fn _, _, _, _ -> :ok end) messages_to_send = @@ -428,7 +437,9 @@ defmodule RealtimeWeb.BroadcastControllerTest do GenCounter |> expect(:add, fn ^request_events_key -> :ok end) - |> expect(:add, length(messages_to_send), fn ^broadcast_events_key -> :ok end) + # remove the one message that won't be broadcasted for this user + |> expect(:add, 1, fn ^connect_events_key -> :ok end) + |> expect(:add, length(messages) - 1, fn ^broadcast_events_key -> :ok end) conn = post(conn, Routes.broadcast_path(conn, :broadcast), %{"messages" => messages}) @@ -482,7 +493,6 @@ defmodule RealtimeWeb.BroadcastControllerTest do GenCounter |> expect(:add, fn ^request_events_key -> 1 end) - |> reject(:add, 1) conn = post(conn, Routes.broadcast_path(conn, :broadcast), %{"messages" => messages}) From 352e375046a53f69a6fb9f94435dd1339d3072b4 Mon Sep 17 00:00:00 2001 From: "Al @h0lybyte" <5599058+h0lybyte@users.noreply.github.com> Date: Tue, 7 Oct 2025 02:06:45 -0400 Subject: [PATCH 7/8] =?UTF-8?q?=F0=9F=94=84=20Sync=20with=20upstream=20cha?= =?UTF-8?q?nges=20(#13)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: runtime setup error (#1520) * fix: use primary instead of replica on rename_settings_field (#1521) * feat: upgrade cowboy & ranch (#1523) * fix: Fix GenRpc to not try to connect to nodes that are not alive (#1525) * fix: enable presence on track message (#1527) currently the user would need to have enabled from the beginning of the channel. 
this will enable users to enable presence later in the flow by sending a track message which will enable presence messages for them
* fix: set cowboy active_n=100 as in cowboy 2.12.0 (#1530) cowboy 2.13.0 sets the default active_n=1
* fix: provide error_code metadata on RealtimeChannel.Logging (#1531)
* feat: disable UTF8 validation on websocket frames (#1532) Currently all text frames are handled only with JSON, which already requires UTF-8
* fix: move DB setup to happen after Connect.init (#1533) This change reduces the impact of slow DB setup on other tenants that landed on the same partition while trying to connect at the same time
* fix: handle wal bloat (#1528) Verify that the replication connection is able to reconnect when faced with WAL bloat issues
* feat: replay realtime.messages (#1526) A new index was created on inserted_at DESC, topic WHERE private IS TRUE AND extension = "broadcast" The hardcoded limit is 25 for now.
* feat: gen_rpc pub sub adapter (#1529) Add a PubSub adapter that uses gen_rpc to send messages to other nodes. It uses :gen_rpc.abcast/3 instead of :erlang.send/2. The adapter works very similarly to the PG2 adapter. It consists of multiple workers that forward to the local node using PubSub.local_broadcast. The worker is chosen based on the sending process, just like the PG2 adapter does. The number of workers is controlled by `:pool_size` or `:broadcast_pool_size`. This distinction exists because Phoenix.PubSub uses `:pool_size` to define how many partitions the PubSub registry will use. It's possible to control them separately by using `:broadcast_pool_size`
* fix: ensure message id doesn't raise on non-map payloads (#1534)
* fix: match error on Connect (#1536)
---------
Co-authored-by: Eduardo Gurgel Pinho
* feat: websocket max heap size configuration (#1538)
* fix: set max process heap size to 500MB instead of 8GB
* feat: set websocket transport max heap size WEBSOCKET_MAX_HEAP_SIZE can be used to configure it (a sketch of the underlying VM flag follows these notes)
* fix: update gen_rpc to fix gen_rpc_dispatcher issues (#1537) Issues:
* Single gen_rpc_dispatcher that can be a bottleneck if connecting takes some time
* Many calls can land on the dispatcher but the node might be gone already. If we don't validate the node it might keep trying to connect until it times out instead of quickly giving up due to not being an actively connected node.
* fix: improve ErlSysMon logging for processes (#1540) Include initial_call, ancestors, registered_name, message_queue_len and total_heap_size. Also bump long_schedule and long_gc
* fix: make pubsub adapter configurable (#1539)
* fix: specify that only private channels are allowed when replaying messages (#1543)
* fix: rate limit connect module (#1541) On bad connection, we rate limit the Connect module so we prevent abuse and excessive logging of errors
* build: automatically cancel old tests/build on new push (#1545) Currently, whenever you push any commit to your branch, the old builds are still running and a new build is started. Once a new commit is added, the old test results no longer matter and it's just a waste of CI resources. Also reduces confusion with multiple builds running in parallel for the same branch, possibly blocking any merges. With this little change, we ensure that whenever a new commit is added, the previous build is immediately canceled/stopped and only the build (latest commit) runs.
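The websocket max heap size setting above relies on the standard BEAM max_heap_size process flag, which takes its limit in words rather than bytes, so a byte budget is divided by the word size (8 on 64-bit systems). A rough sketch of the mechanism, assuming the 50_000_000-byte default that the WEBSOCKET_MAX_HEAP_SIZE handling in config/runtime.exs (shown later in this patch) uses; on a 64-bit system this yields the 6_250_000-word cap asserted in the channel tests:

    # Convert the byte budget to words before handing it to the VM.
    max_heap_words = div(50_000_000, :erlang.system_info(:wordsize))

    # Process.flag/2 applies to the calling process, so the transport
    # process itself must set this (or receive it via spawn options).
    Process.flag(:max_heap_size, %{
      size: max_heap_words,
      # Kill the process once the cap is exceeded...
      kill: true,
      # ...and emit an error report when that happens.
      error_logger: true
    })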
.github/workflows/tests.yml | 4 ++
Makefile | 4 +-
config/dev.exs | 4 +-
config/runtime.exs | 2 +-
.../postgres_cdc_rls/replication_poller.ex | 2 +-
lib/realtime/gen_rpc/pub_sub.ex | 6 +-
lib/realtime/monitoring/latency.ex | 8 +--
.../monitoring/prom_ex/plugins/tenant.ex | 25 ++++++-
.../monitoring/prom_ex/plugins/tenants.ex | 9 +++
lib/realtime/tenants.ex | 14 ++--
lib/realtime/tenants/batch_broadcast.ex | 9 ++-
lib/realtime/tenants/connect.ex | 12 +++-
lib/realtime_web/channels/presence.ex | 1 +
lib/realtime_web/channels/realtime_channel.ex | 17 -----
.../realtime_channel/broadcast_handler.ex | 11 ++-
.../realtime_channel/message_dispatcher.ex | 70 ++++++++++--------
.../realtime_channel/presence_handler.ex | 10 +--
.../channels/tenant_rate_limiters.ex | 43 +++++++++++
lib/realtime_web/channels/user_socket.ex | 12 ++++
lib/realtime_web/endpoint.ex | 2 +-
lib/realtime_web/tenant_broadcaster.ex | 32 +++++----
mix.exs | 4 +-
mix.lock | 2 +-
.../extensions/cdc_rls/cdc_rls_test.exs | 18 +++++
test/realtime/gen_rpc_pub_sub_test.exs | 16 +++++
.../prom_ex/plugins/tenant_test.exs | 43 +++++++++--
.../prom_ex/plugins/tenants_test.exs | 33 +++++++++
test/realtime/tenants/connect_test.exs | 30 ++++++++
.../message_dispatcher_test.exs | 71 ++++++++++++++++---
.../presence_handler_test.exs | 32 ++++++++-
.../channels/realtime_channel_test.exs | 15 +---
.../channels/tenant_rate_limiters_test.exs | 31 ++++++++
.../controllers/broadcast_controller_test.exs | 26 +++----
test/realtime_web/tenant_broadcaster_test.exs | 49 ++++++++++---
34 files changed, 523 insertions(+), 144 deletions(-)
create mode 100644 lib/realtime_web/channels/tenant_rate_limiters.ex
create mode 100644 test/realtime_web/channels/tenant_rate_limiters_test.exs

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 5d3818814..c9c2a73fa 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -16,6 +16,10 @@ on: branches: - main +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: tests: name: Tests diff --git a/Makefile b/Makefile index fd7f0f7fd..1259a1335 100644 --- 
a/Makefile +++ b/Makefile @@ -9,10 +9,10 @@ PORT ?= 4000 # Common commands dev: ## Start a dev server - ELIXIR_ERL_OPTIONS="+hmax 1000000000" SLOT_NAME_SUFFIX=some_sha PORT=$(PORT) MIX_ENV=dev SECURE_CHANNELS=true API_JWT_SECRET=dev METRICS_JWT_SECRET=dev REGION=fra DB_ENC_KEY="1234567890123456" CLUSTER_STRATEGIES=$(CLUSTER_STRATEGIES) ERL_AFLAGS="-kernel shell_history enabled" GEN_RPC_TCP_SERVER_PORT=5369 GEN_RPC_TCP_CLIENT_PORT=5469 iex --name $(NODE_NAME)@127.0.0.1 --cookie cookie -S mix phx.server + ELIXIR_ERL_OPTIONS="+hmax 1000000000" SLOT_NAME_SUFFIX=some_sha PORT=$(PORT) MIX_ENV=dev SECURE_CHANNELS=true API_JWT_SECRET=dev METRICS_JWT_SECRET=dev REGION=us-east-1 DB_ENC_KEY="1234567890123456" CLUSTER_STRATEGIES=$(CLUSTER_STRATEGIES) ERL_AFLAGS="-kernel shell_history enabled" GEN_RPC_TCP_SERVER_PORT=5369 GEN_RPC_TCP_CLIENT_PORT=5469 iex --name $(NODE_NAME)@127.0.0.1 --cookie cookie -S mix phx.server dev.orange: ## Start another dev server (orange) on port 4001 - ELIXIR_ERL_OPTIONS="+hmax 1000000000" SLOT_NAME_SUFFIX=some_sha PORT=4001 MIX_ENV=dev SECURE_CHANNELS=true API_JWT_SECRET=dev METRICS_JWT_SECRET=dev DB_ENC_KEY="1234567890123456" CLUSTER_STRATEGIES=$(CLUSTER_STRATEGIES) ERL_AFLAGS="-kernel shell_history enabled" GEN_RPC_TCP_SERVER_PORT=5469 GEN_RPC_TCP_CLIENT_PORT=5369 iex --name orange@127.0.0.1 --cookie cookie -S mix phx.server + ELIXIR_ERL_OPTIONS="+hmax 1000000000" SLOT_NAME_SUFFIX=some_sha PORT=4001 MIX_ENV=dev SECURE_CHANNELS=true API_JWT_SECRET=dev METRICS_JWT_SECRET=dev REGION=eu-west-1 DB_ENC_KEY="1234567890123456" CLUSTER_STRATEGIES=$(CLUSTER_STRATEGIES) ERL_AFLAGS="-kernel shell_history enabled" GEN_RPC_TCP_SERVER_PORT=5469 GEN_RPC_TCP_CLIENT_PORT=5369 iex --name orange@127.0.0.1 --cookie cookie -S mix phx.server seed: ## Seed the database DB_ENC_KEY="1234567890123456" FLY_ALLOC_ID=123e4567-e89b-12d3-a456-426614174000 mix run priv/repo/dev_seeds.exs diff --git a/config/dev.exs b/config/dev.exs index a438f8ea4..0eff300d8 100644 --- a/config/dev.exs +++ b/config/dev.exs @@ -97,6 +97,8 @@ config :phoenix, :plug_init_mode, :runtime # Disable caching to ensure the rendered spec is refreshed config :open_api_spex, :cache_adapter, OpenApiSpex.Plug.NoneCache -config :opentelemetry, traces_exporter: {:otel_exporter_stdout, []} +# Disabled but can print to stdout with: +# config :opentelemetry, traces_exporter: {:otel_exporter_stdout, []} +config :opentelemetry, traces_exporter: :none config :mix_test_watch, clear: true diff --git a/config/runtime.exs b/config/runtime.exs index 47961f98a..447934b65 100644 --- a/config/runtime.exs +++ b/config/runtime.exs @@ -68,7 +68,7 @@ janitor_children_timeout = Env.get_integer("JANITOR_CHILDREN_TIMEOUT", :timer.se janitor_schedule_timer = Env.get_integer("JANITOR_SCHEDULE_TIMER_IN_MS", :timer.hours(4)) platform = if System.get_env("AWS_EXECUTION_ENV") == "AWS_ECS_FARGATE", do: :aws, else: :fly broadcast_pool_size = Env.get_integer("BROADCAST_POOL_SIZE", 10) -pubsub_adapter = System.get_env("PUBSUB_ADAPTER", "pg2") |> String.to_atom() +pubsub_adapter = System.get_env("PUBSUB_ADAPTER", "gen_rpc") |> String.to_atom() websocket_max_heap_size = div(Env.get_integer("WEBSOCKET_MAX_HEAP_SIZE", 50_000_000), :erlang.system_info(:wordsize)) no_channel_timeout_in_ms = diff --git a/lib/extensions/postgres_cdc_rls/replication_poller.ex b/lib/extensions/postgres_cdc_rls/replication_poller.ex index 65f4a33f1..85466ebe9 100644 --- a/lib/extensions/postgres_cdc_rls/replication_poller.ex +++ b/lib/extensions/postgres_cdc_rls/replication_poller.ex @@ -183,7 
+183,7 @@ defmodule Extensions.PostgresCdcRls.ReplicationPoller do change <- columns |> Enum.zip(row) |> generate_record() |> List.wrap() do topic = "realtime:postgres:" <> tenant_id - RealtimeWeb.TenantBroadcaster.pubsub_broadcast(tenant_id, topic, change, MessageDispatcher) + RealtimeWeb.TenantBroadcaster.pubsub_broadcast(tenant_id, topic, change, MessageDispatcher, :postgres_changes) end {:ok, rows_count} diff --git a/lib/realtime/gen_rpc/pub_sub.ex b/lib/realtime/gen_rpc/pub_sub.ex index b2a90b165..3ba9e053a 100644 --- a/lib/realtime/gen_rpc/pub_sub.ex +++ b/lib/realtime/gen_rpc/pub_sub.ex @@ -65,7 +65,11 @@ defmodule Realtime.GenRpcPubSub.Worker do def start_link({pubsub, worker}), do: GenServer.start_link(__MODULE__, pubsub, name: worker) @impl true - def init(pubsub), do: {:ok, pubsub} + def init(pubsub) do + Process.flag(:message_queue_data, :off_heap) + Process.flag(:fullsweep_after, 100) + {:ok, pubsub} + end @impl true def handle_info({:ftl, topic, message, dispatcher}, pubsub) do diff --git a/lib/realtime/monitoring/latency.ex b/lib/realtime/monitoring/latency.ex index 52c46adb4..d9ddd0d9a 100644 --- a/lib/realtime/monitoring/latency.ex +++ b/lib/realtime/monitoring/latency.ex @@ -7,7 +7,7 @@ defmodule Realtime.Latency do use Realtime.Logs alias Realtime.Nodes - alias Realtime.Rpc + alias Realtime.GenRpc defmodule Payload do @moduledoc false @@ -33,7 +33,7 @@ defmodule Realtime.Latency do } end - @every 5_000 + @every 15_000 def start_link(args) do GenServer.start_link(__MODULE__, args, name: __MODULE__) end @@ -76,7 +76,7 @@ defmodule Realtime.Latency do Task.Supervisor.async(Realtime.TaskSupervisor, fn -> {latency, response} = :timer.tc(fn -> - Rpc.call(n, __MODULE__, :pong, [pong_timeout], timeout: timer_timeout) + GenRpc.call(n, __MODULE__, :pong, [pong_timeout], timeout: timer_timeout) end) latency_ms = latency / 1_000 @@ -85,7 +85,7 @@ defmodule Realtime.Latency do from_node = Nodes.short_node_id_from_name(Node.self()) case response do - {:badrpc, reason} -> + {:error, :rpc_error, reason} -> log_error( "RealtimeNodeDisconnected", "Unable to connect to #{short_name} from #{region}: #{reason}" diff --git a/lib/realtime/monitoring/prom_ex/plugins/tenant.ex b/lib/realtime/monitoring/prom_ex/plugins/tenant.ex index 1bd324624..a3019a68a 100644 --- a/lib/realtime/monitoring/prom_ex/plugins/tenant.ex +++ b/lib/realtime/monitoring/prom_ex/plugins/tenant.ex @@ -36,10 +36,10 @@ defmodule Realtime.PromEx.Plugins.Tenant do event_name: [:realtime, :tenants, :payload, :size], measurement: :size, description: "Tenant payload size", - tags: [:tenant], + tags: [:tenant, :message_type], unit: :byte, reporter_options: [ - buckets: [100, 250, 500, 1000, 2000, 3000, 5000, 10_000, 25_000] + buckets: [250, 500, 1000, 3000, 5000, 10_000, 25_000, 100_000, 500_000, 1_000_000, 3_000_000] ] ), distribution( @@ -47,9 +47,10 @@ defmodule Realtime.PromEx.Plugins.Tenant do event_name: [:realtime, :tenants, :payload, :size], measurement: :size, description: "Payload size", + tags: [:message_type], unit: :byte, reporter_options: [ - buckets: [100, 250, 500, 1000, 2000, 3000, 5000, 10_000, 25_000] + buckets: [250, 500, 1000, 3000, 5000, 10_000, 25_000, 100_000, 500_000, 1_000_000, 3_000_000] ] ) ] @@ -157,6 +158,12 @@ defmodule Realtime.PromEx.Plugins.Tenant do description: "Sum of messages sent on a Realtime Channel.", tags: [:tenant] ), + sum( + [:realtime, :channel, :global, :events], + event_name: [:realtime, :rate_counter, :channel, :events], + measurement: :sum, + description: "Global sum of messages 
sent on a Realtime Channel." + ), sum( [:realtime, :channel, :presence_events], event_name: [:realtime, :rate_counter, :channel, :presence_events], @@ -164,6 +171,12 @@ defmodule Realtime.PromEx.Plugins.Tenant do description: "Sum of presence messages sent on a Realtime Channel.", tags: [:tenant] ), + sum( + [:realtime, :channel, :global, :presence_events], + event_name: [:realtime, :rate_counter, :channel, :presence_events], + measurement: :sum, + description: "Global sum of presence messages sent on a Realtime Channel." + ), sum( [:realtime, :channel, :db_events], event_name: [:realtime, :rate_counter, :channel, :db_events], @@ -171,6 +184,12 @@ defmodule Realtime.PromEx.Plugins.Tenant do description: "Sum of db messages sent on a Realtime Channel.", tags: [:tenant] ), + sum( + [:realtime, :channel, :global, :db_events], + event_name: [:realtime, :rate_counter, :channel, :db_events], + measurement: :sum, + description: "Global sum of db messages sent on a Realtime Channel." + ), sum( [:realtime, :channel, :joins], event_name: [:realtime, :rate_counter, :channel, :joins], diff --git a/lib/realtime/monitoring/prom_ex/plugins/tenants.ex b/lib/realtime/monitoring/prom_ex/plugins/tenants.ex index 0035e9594..e8106df58 100644 --- a/lib/realtime/monitoring/prom_ex/plugins/tenants.ex +++ b/lib/realtime/monitoring/prom_ex/plugins/tenants.ex @@ -21,6 +21,15 @@ defmodule Realtime.PromEx.Plugins.Tenants do unit: {:microsecond, :millisecond}, tags: [:success, :tenant, :mechanism], reporter_options: [buckets: [10, 250, 5000, 15_000]] + ), + distribution( + [:realtime, :global, :rpc], + event_name: [:realtime, :rpc], + description: "Global Latency of rpc calls", + measurement: :latency, + unit: {:microsecond, :millisecond}, + tags: [:success, :mechanism], + reporter_options: [buckets: [10, 250, 5000, 15_000]] ) ]) end diff --git a/lib/realtime/tenants.ex b/lib/realtime/tenants.ex index db2a02cc4..efd2397ac 100644 --- a/lib/realtime/tenants.ex +++ b/lib/realtime/tenants.ex @@ -328,18 +328,18 @@ defmodule Realtime.Tenants do %RateCounter.Args{id: {:channel, :authorization_errors, external_id}, opts: opts} end - @connect_per_second_default 10 + @connect_errors_per_second_default 10 @doc "RateCounter arguments for counting connect per second." 
- @spec connect_per_second_rate(Tenant.t() | String.t()) :: RateCounter.Args.t() - def connect_per_second_rate(%Tenant{external_id: external_id}) do - connect_per_second_rate(external_id) + @spec connect_errors_per_second_rate(Tenant.t() | String.t()) :: RateCounter.Args.t() + def connect_errors_per_second_rate(%Tenant{external_id: external_id}) do + connect_errors_per_second_rate(external_id) end - def connect_per_second_rate(tenant_id) do + def connect_errors_per_second_rate(tenant_id) do opts = [ - max_bucket_len: 10, + max_bucket_len: 30, limit: [ - value: @connect_per_second_default, + value: @connect_errors_per_second_default, measurement: :sum, log_fn: fn -> Logger.critical( diff --git a/lib/realtime/tenants/batch_broadcast.ex b/lib/realtime/tenants/batch_broadcast.ex index 98427621b..9e4ed4c3c 100644 --- a/lib/realtime/tenants/batch_broadcast.ex +++ b/lib/realtime/tenants/batch_broadcast.ex @@ -129,7 +129,14 @@ defmodule Realtime.Tenants.BatchBroadcast do broadcast = %Phoenix.Socket.Broadcast{topic: message.topic, event: @event_type, payload: payload} GenCounter.add(events_per_second_rate.id) - TenantBroadcaster.pubsub_broadcast(tenant.external_id, tenant_topic, broadcast, RealtimeChannel.MessageDispatcher) + + TenantBroadcaster.pubsub_broadcast( + tenant.external_id, + tenant_topic, + broadcast, + RealtimeChannel.MessageDispatcher, + :broadcast + ) end defp permissions_for_message(_, nil, _), do: nil diff --git a/lib/realtime/tenants/connect.ex b/lib/realtime/tenants/connect.ex index 0ee43f161..caf49cc57 100644 --- a/lib/realtime/tenants/connect.ex +++ b/lib/realtime/tenants/connect.ex @@ -57,7 +57,7 @@ defmodule Realtime.Tenants.Connect do | {:error, :connect_rate_limit_reached} | {:error, :rpc_error, term()} def lookup_or_start_connection(tenant_id, opts \\ []) when is_binary(tenant_id) do - rate_args = Tenants.connect_per_second_rate(tenant_id) + rate_args = Tenants.connect_errors_per_second_rate(tenant_id) RateCounter.new(rate_args) with {:ok, %{limit: %{triggered: false}}} <- RateCounter.get(rate_args), @@ -68,8 +68,14 @@ defmodule Realtime.Tenants.Connect do {:error, :connect_rate_limit_reached} {:error, :tenant_database_connection_initializing} -> - GenCounter.add(rate_args.id) - call_external_node(tenant_id, opts) + case call_external_node(tenant_id, opts) do + {:ok, pid} -> + {:ok, pid} + + error -> + GenCounter.add(rate_args.id) + error + end {:error, :initializing} -> {:error, :tenant_database_unavailable} diff --git a/lib/realtime_web/channels/presence.ex b/lib/realtime_web/channels/presence.ex index f4d378b92..9e173febe 100644 --- a/lib/realtime_web/channels/presence.ex +++ b/lib/realtime_web/channels/presence.ex @@ -8,5 +8,6 @@ defmodule RealtimeWeb.Presence do use Phoenix.Presence, otp_app: :realtime, pubsub_server: Realtime.PubSub, + dispatcher: RealtimeWeb.RealtimeChannel.MessageDispatcher, pool_size: 10 end diff --git a/lib/realtime_web/channels/realtime_channel.ex b/lib/realtime_web/channels/realtime_channel.ex index 91a417c21..104d9a077 100644 --- a/lib/realtime_web/channels/realtime_channel.ex +++ b/lib/realtime_web/channels/realtime_channel.ex @@ -18,7 +18,6 @@ defmodule RealtimeWeb.RealtimeChannel do alias Realtime.Tenants.Authorization alias Realtime.Tenants.Authorization.Policies alias Realtime.Tenants.Authorization.Policies.BroadcastPolicies - alias Realtime.Tenants.Authorization.Policies.PresencePolicies alias Realtime.Tenants.Connect alias RealtimeWeb.Channels.Payloads.Join @@ -259,27 +258,11 @@ defmodule RealtimeWeb.RealtimeChannel do {:noreply, 
assign(socket, %{pg_sub_ref: pg_sub_ref})} end - def handle_info( - %{event: "presence_diff"}, - %{assigns: %{policies: %Policies{presence: %PresencePolicies{read: false}}}} = socket - ) do - Logger.warning("Presence message ignored") - {:noreply, socket} - end - def handle_info(_msg, %{assigns: %{policies: %Policies{broadcast: %BroadcastPolicies{read: false}}}} = socket) do Logger.warning("Broadcast message ignored") {:noreply, socket} end - def handle_info(%{event: "presence_diff", payload: payload} = msg, socket) do - %{presence_rate_counter: presence_rate_counter} = socket.assigns - GenCounter.add(presence_rate_counter.id) - maybe_log_info(socket, msg) - push(socket, "presence_diff", payload) - {:noreply, socket} - end - def handle_info(%{event: type, payload: payload} = msg, socket) do count(socket) maybe_log_info(socket, msg) diff --git a/lib/realtime_web/channels/realtime_channel/broadcast_handler.ex b/lib/realtime_web/channels/realtime_channel/broadcast_handler.ex index f8e736c2e..036ad9159 100644 --- a/lib/realtime_web/channels/realtime_channel/broadcast_handler.ex +++ b/lib/realtime_web/channels/realtime_channel/broadcast_handler.ex @@ -76,14 +76,21 @@ defmodule RealtimeWeb.RealtimeChannel.BroadcastHandler do broadcast = %Phoenix.Socket.Broadcast{topic: tenant_topic, event: @event_type, payload: payload} if self_broadcast do - TenantBroadcaster.pubsub_broadcast(tenant_id, tenant_topic, broadcast, RealtimeChannel.MessageDispatcher) + TenantBroadcaster.pubsub_broadcast( + tenant_id, + tenant_topic, + broadcast, + RealtimeChannel.MessageDispatcher, + :broadcast + ) else TenantBroadcaster.pubsub_broadcast_from( tenant_id, self(), tenant_topic, broadcast, - RealtimeChannel.MessageDispatcher + RealtimeChannel.MessageDispatcher, + :broadcast ) end end diff --git a/lib/realtime_web/channels/realtime_channel/message_dispatcher.ex b/lib/realtime_web/channels/realtime_channel/message_dispatcher.ex index 32e1528f3..6604eb2bd 100644 --- a/lib/realtime_web/channels/realtime_channel/message_dispatcher.ex +++ b/lib/realtime_web/channels/realtime_channel/message_dispatcher.ex @@ -5,14 +5,8 @@ defmodule RealtimeWeb.RealtimeChannel.MessageDispatcher do require Logger - def fastlane_metadata(fastlane_pid, serializer, topic, log_level, tenant_id, replayed_message_ids \\ MapSet.new()) - - def fastlane_metadata(fastlane_pid, serializer, topic, :info, tenant_id, replayed_message_ids) do - {:rc_fastlane, fastlane_pid, serializer, topic, {:log, tenant_id}, replayed_message_ids} - end - - def fastlane_metadata(fastlane_pid, serializer, topic, _log_level, _tenant_id, replayed_message_ids) do - {:rc_fastlane, fastlane_pid, serializer, topic, replayed_message_ids} + def fastlane_metadata(fastlane_pid, serializer, topic, log_level, tenant_id, replayed_message_ids \\ MapSet.new()) do + {:rc_fastlane, fastlane_pid, serializer, topic, log_level, tenant_id, replayed_message_ids} end @doc """ @@ -20,48 +14,58 @@ defmodule RealtimeWeb.RealtimeChannel.MessageDispatcher do It also sends an :update_rate_counter to the subscriber and it can conditionally log """ @spec dispatch(list, pid, Phoenix.Socket.Broadcast.t()) :: :ok - def dispatch(subscribers, from, %Phoenix.Socket.Broadcast{} = msg) do + def dispatch(subscribers, from, %Phoenix.Socket.Broadcast{event: event} = msg) do # fastlane_pid is the actual socket transport pid # This reduce caches the serialization and bypasses the channel process going straight to the # transport process message_id = message_id(msg.payload) - # Credo doesn't like that we don't use the 
result aggregation - _ = - Enum.reduce(subscribers, %{}, fn - {pid, _}, cache when pid == from -> - cache + {_cache, count} = + Enum.reduce(subscribers, {%{}, 0}, fn + {pid, _}, {cache, count} when pid == from -> + {cache, count} - {pid, {:rc_fastlane, fastlane_pid, serializer, join_topic, replayed_message_ids}}, cache -> + {pid, {:rc_fastlane, fastlane_pid, serializer, join_topic, log_level, tenant_id, replayed_message_ids}}, + {cache, count} -> if already_replayed?(message_id, replayed_message_ids) do # skip already replayed message - cache + {cache, count} else - send(pid, :update_rate_counter) - do_dispatch(msg, fastlane_pid, serializer, join_topic, cache) - end + if event != "presence_diff", do: send(pid, :update_rate_counter) - {pid, {:rc_fastlane, fastlane_pid, serializer, join_topic, {:log, tenant_id}, replayed_message_ids}}, cache -> - if already_replayed?(message_id, replayed_message_ids) do - # skip already replayed message - cache - else - send(pid, :update_rate_counter) - log = "Received message on #{join_topic} with payload: #{inspect(msg, pretty: true)}" - Logger.info(log, external_id: tenant_id, project: tenant_id) + maybe_log(log_level, join_topic, msg, tenant_id) - do_dispatch(msg, fastlane_pid, serializer, join_topic, cache) + cache = do_dispatch(msg, fastlane_pid, serializer, join_topic, cache) + {cache, count + 1} end - {pid, _}, cache -> + {pid, _}, {cache, count} -> send(pid, msg) - cache + {cache, count} end) + tenant_id = tenant_id(subscribers) + increment_presence_counter(tenant_id, event, count) + :ok end + defp increment_presence_counter(tenant_id, "presence_diff", count) when is_binary(tenant_id) do + tenant_id + |> Realtime.Tenants.presence_events_per_second_key() + |> Realtime.GenCounter.add(count) + end + + defp increment_presence_counter(_tenant_id, _event, _count), do: :ok + + defp maybe_log(:info, join_topic, msg, tenant_id) do + log = "Received message on #{join_topic} with payload: #{inspect(msg, pretty: true)}" + Logger.info(log, external_id: tenant_id, project: tenant_id) + end + + defp maybe_log(_level, _join_topic, _msg, _tenant_id), do: :ok + defp message_id(%{"meta" => %{"id" => id}}), do: id defp message_id(_), do: nil @@ -82,4 +86,10 @@ defmodule RealtimeWeb.RealtimeChannel.MessageDispatcher do Map.put(cache, serializer, encoded_msg) end end + + defp tenant_id([{_pid, {:rc_fastlane, _, _, _, _, tenant_id, _}} | _]) do + tenant_id + end + + defp tenant_id(_), do: nil end diff --git a/lib/realtime_web/channels/realtime_channel/presence_handler.ex b/lib/realtime_web/channels/realtime_channel/presence_handler.ex index 9dc23d219..ec16c7b16 100644 --- a/lib/realtime_web/channels/realtime_channel/presence_handler.ex +++ b/lib/realtime_web/channels/realtime_channel/presence_handler.ex @@ -11,7 +11,7 @@ defmodule RealtimeWeb.RealtimeChannel.PresenceHandler do alias Phoenix.Tracker.Shard alias Realtime.GenCounter alias Realtime.RateCounter - alias Realtime.Tenants + # alias Realtime.Tenants alias Realtime.Tenants.Authorization alias RealtimeWeb.Presence alias RealtimeWeb.RealtimeChannel.Logging @@ -109,6 +109,7 @@ defmodule RealtimeWeb.RealtimeChannel.PresenceHandler do %{assigns: %{presence_key: presence_key, tenant_topic: tenant_topic}} = socket payload = Map.get(payload, "payload", %{}) + RealtimeWeb.TenantBroadcaster.collect_payload_size(socket.assigns.tenant, payload, :presence) with :ok <- limit_presence_event(socket), {:ok, _} <- Presence.track(self(), tenant_topic, presence_key, payload) do @@ -138,13 +139,14 @@ defmodule 
RealtimeWeb.RealtimeChannel.PresenceHandler do |> Phoenix.Presence.group() end + @presence_limit 100 defp limit_presence_event(socket) do - %{assigns: %{presence_rate_counter: presence_counter, tenant: tenant_id}} = socket + %{assigns: %{presence_rate_counter: presence_counter, tenant: _tenant_id}} = socket {:ok, rate_counter} = RateCounter.get(presence_counter) - tenant = Tenants.Cache.get_tenant_by_external_id(tenant_id) + # tenant = Tenants.Cache.get_tenant_by_external_id(tenant_id) - if rate_counter.avg > tenant.max_presence_events_per_second do + if rate_counter.avg > @presence_limit do {:error, :rate_limit_exceeded} else GenCounter.add(presence_counter.id) diff --git a/lib/realtime_web/channels/tenant_rate_limiters.ex b/lib/realtime_web/channels/tenant_rate_limiters.ex new file mode 100644 index 000000000..2101ac945 --- /dev/null +++ b/lib/realtime_web/channels/tenant_rate_limiters.ex @@ -0,0 +1,43 @@ +defmodule RealtimeWeb.TenantRateLimiters do + @moduledoc """ + Rate limiters for tenants. + """ + require Logger + alias Realtime.UsersCounter + alias Realtime.Tenants + alias Realtime.RateCounter + alias Realtime.Api.Tenant + + @spec check_tenant(Realtime.Api.Tenant.t()) :: :ok | {:error, :too_many_connections | :too_many_joins} + def check_tenant(tenant) do + with :ok <- max_concurrent_users_check(tenant) do + max_joins_per_second_check(tenant) + end + end + + defp max_concurrent_users_check(%Tenant{max_concurrent_users: max_conn_users, external_id: external_id}) do + total_conn_users = UsersCounter.tenant_users(external_id) + + if total_conn_users < max_conn_users, + do: :ok, + else: {:error, :too_many_connections} + end + + defp max_joins_per_second_check(%Tenant{max_joins_per_second: max_joins_per_second} = tenant) do + rate_args = Tenants.joins_per_second_rate(tenant.external_id, max_joins_per_second) + + RateCounter.new(rate_args) + + case RateCounter.get(rate_args) do + {:ok, %{limit: %{triggered: false}}} -> + :ok + + {:ok, %{limit: %{triggered: true}}} -> + {:error, :too_many_joins} + + error -> + Logger.error("UnknownErrorOnCounter: #{inspect(error)}") + {:error, error} + end + end +end diff --git a/lib/realtime_web/channels/user_socket.ex b/lib/realtime_web/channels/user_socket.ex index 849aa052d..6d4bf9017 100644 --- a/lib/realtime_web/channels/user_socket.ex +++ b/lib/realtime_web/channels/user_socket.ex @@ -16,6 +16,7 @@ defmodule RealtimeWeb.UserSocket do alias Realtime.PostgresCdc alias Realtime.Tenants + alias RealtimeWeb.TenantRateLimiters alias RealtimeWeb.ChannelsAuthorization alias RealtimeWeb.RealtimeChannel alias RealtimeWeb.RealtimeChannel.Logging @@ -56,6 +57,7 @@ defmodule RealtimeWeb.UserSocket do token when is_binary(token) <- token, jwt_secret_dec <- Crypto.decrypt!(jwt_secret), {:ok, claims} <- ChannelsAuthorization.authorize_conn(token, jwt_secret_dec, jwt_jwks), + :ok <- TenantRateLimiters.check_tenant(tenant), {:ok, postgres_cdc_module} <- PostgresCdc.driver(postgres_cdc_default) do %Tenant{ extensions: extensions, @@ -111,6 +113,16 @@ defmodule RealtimeWeb.UserSocket do log_error("MalformedJWT", "The token provided is not a valid JWT") {:error, :token_malformed} + {:error, :too_many_connections} -> + msg = "Too many connected users" + Logging.log_error(socket, "ConnectionRateLimitReached", msg) + {:error, :too_many_connections} + + {:error, :too_many_joins} -> + msg = "Too many joins per second" + Logging.log_error(socket, "JoinsRateLimitReached", msg) + {:error, :too_many_joins} + error -> log_error("ErrorConnectingToWebsocket", error) error diff 
--git a/lib/realtime_web/endpoint.ex b/lib/realtime_web/endpoint.ex index 190e1a917..894911803 100644 --- a/lib/realtime_web/endpoint.ex +++ b/lib/realtime_web/endpoint.ex @@ -15,7 +15,7 @@ defmodule RealtimeWeb.Endpoint do websocket: [ connect_info: [:peer_data, :uri, :x_headers], fullsweep_after: 20, - max_frame_size: 8_000_000, + max_frame_size: 5_000_000, # https://github.com/ninenines/cowboy/blob/24d32de931a0c985ff7939077463fc8be939f0e9/doc/src/manual/cowboy_websocket.asciidoc#L228 # active_n: The number of packets Cowboy will request from the socket at once. # This can be used to tweak the performance of the server. Higher values reduce diff --git a/lib/realtime_web/tenant_broadcaster.ex b/lib/realtime_web/tenant_broadcaster.ex index da02df79e..f8b739a0b 100644 --- a/lib/realtime_web/tenant_broadcaster.ex +++ b/lib/realtime_web/tenant_broadcaster.ex @@ -5,9 +5,12 @@ defmodule RealtimeWeb.TenantBroadcaster do alias Phoenix.PubSub - @spec pubsub_broadcast(tenant_id :: String.t(), PubSub.topic(), PubSub.message(), PubSub.dispatcher()) :: :ok - def pubsub_broadcast(tenant_id, topic, message, dispatcher) do - collect_payload_size(tenant_id, message) + @type message_type :: :broadcast | :presence | :postgres_changes + + @spec pubsub_broadcast(tenant_id :: String.t(), PubSub.topic(), PubSub.message(), PubSub.dispatcher(), message_type) :: + :ok + def pubsub_broadcast(tenant_id, topic, message, dispatcher, message_type) do + collect_payload_size(tenant_id, message, message_type) if pubsub_adapter() == :gen_rpc do PubSub.broadcast(Realtime.PubSub, topic, message, dispatcher) @@ -23,11 +26,12 @@ defmodule RealtimeWeb.TenantBroadcaster do from :: pid, PubSub.topic(), PubSub.message(), - PubSub.dispatcher() + PubSub.dispatcher(), + message_type ) :: :ok - def pubsub_broadcast_from(tenant_id, from, topic, message, dispatcher) do - collect_payload_size(tenant_id, message) + def pubsub_broadcast_from(tenant_id, from, topic, message, dispatcher, message_type) do + collect_payload_size(tenant_id, message, message_type) if pubsub_adapter() == :gen_rpc do PubSub.broadcast_from(Realtime.PubSub, from, topic, message, dispatcher) @@ -45,16 +49,18 @@ defmodule RealtimeWeb.TenantBroadcaster do @payload_size_event [:realtime, :tenants, :payload, :size] - defp collect_payload_size(tenant_id, payload) when is_struct(payload) do + @spec collect_payload_size(tenant_id :: String.t(), payload :: term, message_type :: message_type) :: :ok + def collect_payload_size(tenant_id, payload, message_type) when is_struct(payload) do # Extracting from struct so the __struct__ bit is not calculated as part of the payload - collect_payload_size(tenant_id, Map.from_struct(payload)) + collect_payload_size(tenant_id, Map.from_struct(payload), message_type) end - defp collect_payload_size(tenant_id, payload) do - :telemetry.execute(@payload_size_event, %{size: :erlang.external_size(payload)}, %{tenant: tenant_id}) + def collect_payload_size(tenant_id, payload, message_type) do + :telemetry.execute(@payload_size_event, %{size: :erlang.external_size(payload)}, %{ + tenant: tenant_id, + message_type: message_type + }) end - defp pubsub_adapter do - Application.fetch_env!(:realtime, :pubsub_adapter) - end + defp pubsub_adapter, do: Application.fetch_env!(:realtime, :pubsub_adapter) end diff --git a/mix.exs b/mix.exs index 4b0b1f40c..d0e42bf11 100644 --- a/mix.exs +++ b/mix.exs @@ -4,7 +4,7 @@ defmodule Realtime.MixProject do def project do [ app: :realtime, - version: "2.51.5", + version: "2.53.0", elixir: "~> 1.17.3", elixirc_paths: 
elixirc_paths(Mix.env()), start_permanent: Mix.env() == :prod, @@ -53,7 +53,7 @@ defmodule Realtime.MixProject do # Type `mix help deps` for examples and options. defp deps do [ - {:phoenix, "~> 1.7.0"}, + {:phoenix, override: true, github: "supabase/phoenix", branch: "feat/presence-custom-dispatcher-1.7.19"}, {:phoenix_ecto, "~> 4.4.0"}, {:ecto_sql, "~> 3.11"}, {:ecto_psql_extras, "~> 0.8"}, diff --git a/mix.lock b/mix.lock index c5fce6022..ba6f47328 100644 --- a/mix.lock +++ b/mix.lock @@ -66,7 +66,7 @@ "opentelemetry_semantic_conventions": {:hex, :opentelemetry_semantic_conventions, "1.27.0", "acd0194a94a1e57d63da982ee9f4a9f88834ae0b31b0bd850815fe9be4bbb45f", [:mix, :rebar3], [], "hexpm", "9681ccaa24fd3d810b4461581717661fd85ff7019b082c2dff89c7d5b1fc2864"}, "opentelemetry_telemetry": {:hex, :opentelemetry_telemetry, "1.1.2", "410ab4d76b0921f42dbccbe5a7c831b8125282850be649ee1f70050d3961118a", [:mix, :rebar3], [{:opentelemetry_api, "~> 1.3", [hex: :opentelemetry_api, repo: "hexpm", optional: false]}, {:telemetry, "~> 1.1", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "641ab469deb181957ac6d59bce6e1321d5fe2a56df444fc9c19afcad623ab253"}, "otel_http": {:hex, :otel_http, "0.2.0", "b17385986c7f1b862f5d577f72614ecaa29de40392b7618869999326b9a61d8a", [:rebar3], [], "hexpm", "f2beadf922c8cfeb0965488dd736c95cc6ea8b9efce89466b3904d317d7cc717"}, - "phoenix": {:hex, :phoenix, "1.7.19", "36617efe5afbd821099a8b994ff4618a340a5bfb25531a1802c4d4c634017a57", [:mix], [{:castore, ">= 0.0.0", [hex: :castore, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: true]}, {:phoenix_pubsub, "~> 2.1", [hex: :phoenix_pubsub, repo: "hexpm", optional: false]}, {:phoenix_template, "~> 1.0", [hex: :phoenix_template, repo: "hexpm", optional: false]}, {:phoenix_view, "~> 2.0", [hex: :phoenix_view, repo: "hexpm", optional: true]}, {:plug, "~> 1.14", [hex: :plug, repo: "hexpm", optional: false]}, {:plug_cowboy, "~> 2.7", [hex: :plug_cowboy, repo: "hexpm", optional: true]}, {:plug_crypto, "~> 1.2 or ~> 2.0", [hex: :plug_crypto, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}, {:websock_adapter, "~> 0.5.3", [hex: :websock_adapter, repo: "hexpm", optional: false]}], "hexpm", "ba4dc14458278773f905f8ae6c2ec743d52c3a35b6b353733f64f02dfe096cd6"}, + "phoenix": {:git, "https://github.com/supabase/phoenix.git", "7b884cc0cc1a49ad2bc272acda2e622b3e11c139", [branch: "feat/presence-custom-dispatcher-1.7.19"]}, "phoenix_ecto": {:hex, :phoenix_ecto, "4.4.3", "86e9878f833829c3f66da03d75254c155d91d72a201eb56ae83482328dc7ca93", [:mix], [{:ecto, "~> 3.5", [hex: :ecto, repo: "hexpm", optional: false]}, {:phoenix_html, "~> 2.14.2 or ~> 3.0 or ~> 4.0", [hex: :phoenix_html, repo: "hexpm", optional: true]}, {:plug, "~> 1.9", [hex: :plug, repo: "hexpm", optional: false]}], "hexpm", "d36c401206f3011fefd63d04e8ef626ec8791975d9d107f9a0817d426f61ac07"}, "phoenix_html": {:hex, :phoenix_html, "3.3.4", "42a09fc443bbc1da37e372a5c8e6755d046f22b9b11343bf885067357da21cb3", [:mix], [{:plug, "~> 1.5", [hex: :plug, repo: "hexpm", optional: true]}], "hexpm", "0249d3abec3714aff3415e7ee3d9786cb325be3151e6c4b3021502c585bf53fb"}, "phoenix_live_dashboard": {:hex, :phoenix_live_dashboard, "0.8.6", "7b1f0327f54c9eb69845fd09a77accf922f488c549a7e7b8618775eb603a62c7", [:mix], [{:ecto, "~> 3.6.2 or ~> 3.7", [hex: :ecto, repo: "hexpm", optional: true]}, {:ecto_mysql_extras, "~> 0.5", [hex: :ecto_mysql_extras, repo: "hexpm", optional: true]}, 
{:ecto_psql_extras, "~> 0.7", [hex: :ecto_psql_extras, repo: "hexpm", optional: true]}, {:ecto_sqlite3_extras, "~> 1.1.7 or ~> 1.2.0", [hex: :ecto_sqlite3_extras, repo: "hexpm", optional: true]}, {:mime, "~> 1.6 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:phoenix_live_view, "~> 0.19 or ~> 1.0", [hex: :phoenix_live_view, repo: "hexpm", optional: false]}, {:telemetry_metrics, "~> 0.6 or ~> 1.0", [hex: :telemetry_metrics, repo: "hexpm", optional: false]}], "hexpm", "1681ab813ec26ca6915beb3414aa138f298e17721dc6a2bde9e6eb8a62360ff6"}, diff --git a/test/realtime/extensions/cdc_rls/cdc_rls_test.exs b/test/realtime/extensions/cdc_rls/cdc_rls_test.exs index 5f341c134..d12c0ba73 100644 --- a/test/realtime/extensions/cdc_rls/cdc_rls_test.exs +++ b/test/realtime/extensions/cdc_rls/cdc_rls_test.exs @@ -236,6 +236,15 @@ defmodule Realtime.Extensions.CdcRlsTest do RateCounter.stop(tenant.external_id) + on_exit(fn -> :telemetry.detach(__MODULE__) end) + + :telemetry.attach( + __MODULE__, + [:realtime, :tenants, :payload, :size], + &__MODULE__.handle_telemetry/4, + pid: self() + ) + %{tenant: tenant, conn: conn} end @@ -317,6 +326,13 @@ defmodule Realtime.Extensions.CdcRlsTest do assert {:ok, %RateCounter{id: {:channel, :db_events, "dev_tenant"}, bucket: bucket}} = RateCounter.get(rate) assert 1 in bucket + + assert_receive { + :telemetry, + [:realtime, :tenants, :payload, :size], + %{size: 341}, + %{tenant: "dev_tenant", message_type: :postgres_changes} + } end @aux_mod (quote do @@ -414,4 +430,6 @@ defmodule Realtime.Extensions.CdcRlsTest do :erpc.call(node, PostgresCdcRls, :handle_stop, [tenant.external_id, 10_000]) end end + + def handle_telemetry(event, measures, metadata, pid: pid), do: send(pid, {:telemetry, event, measures, metadata}) end diff --git a/test/realtime/gen_rpc_pub_sub_test.exs b/test/realtime/gen_rpc_pub_sub_test.exs index 0013c2e7b..517c6c369 100644 --- a/test/realtime/gen_rpc_pub_sub_test.exs +++ b/test/realtime/gen_rpc_pub_sub_test.exs @@ -1,2 +1,18 @@ Application.put_env(:phoenix_pubsub, :test_adapter, {Realtime.GenRpcPubSub, []}) Code.require_file("../../deps/phoenix_pubsub/test/shared/pubsub_test.exs", __DIR__) + +defmodule Realtime.GenRpcPubSubTest do + use ExUnit.Case, async: true + + test "it sets off_heap message_queue_data flag on the workers" do + assert Realtime.PubSubElixir.Realtime.PubSub.Adapter_1 + |> Process.whereis() + |> Process.info(:message_queue_data) == {:message_queue_data, :off_heap} + end + + test "it sets fullsweep_after flag on the workers" do + assert Realtime.PubSubElixir.Realtime.PubSub.Adapter_1 + |> Process.whereis() + |> Process.info(:fullsweep_after) == {:fullsweep_after, 100} + end +end diff --git a/test/realtime/monitoring/prom_ex/plugins/tenant_test.exs b/test/realtime/monitoring/prom_ex/plugins/tenant_test.exs index 164c8d2eb..77c1dc7cf 100644 --- a/test/realtime/monitoring/prom_ex/plugins/tenant_test.exs +++ b/test/realtime/monitoring/prom_ex/plugins/tenant_test.exs @@ -129,6 +129,17 @@ defmodule Realtime.PromEx.Plugins.TenantTest do assert metric_value(pattern) == metric_value + 1 end + test "global event exists after counter added", %{tenant: %{external_id: external_id}} do + pattern = + ~r/realtime_channel_global_events\s(?<value>\d+)/ + + metric_value = metric_value(pattern) + FakeUserCounter.fake_event(external_id) + + Process.sleep(200) + assert metric_value(pattern) == metric_value + 1 + end + test "db_event exists after counter added", %{tenant: %{external_id: external_id}} do pattern = ~r/realtime_channel_db_events{tenant="#{external_id}"}\s(?<value>\d+)/ @@ -139,6 +150,16 @@ defmodule Realtime.PromEx.Plugins.TenantTest do assert metric_value(pattern) == metric_value + 1 end + test "global db_event exists after counter added", %{tenant: %{external_id: external_id}} do + pattern = + ~r/realtime_channel_global_db_events\s(?<value>\d+)/ + + metric_value = metric_value(pattern) + FakeUserCounter.fake_db_event(external_id) + Process.sleep(200) + assert metric_value(pattern) == metric_value + 1 + end + test "presence_event exists after counter added", %{tenant: %{external_id: external_id}} do pattern = ~r/realtime_channel_presence_events{tenant="#{external_id}"}\s(?<value>\d+)/ @@ -149,6 +170,16 @@ defmodule Realtime.PromEx.Plugins.TenantTest do assert metric_value(pattern) == metric_value + 1 end + test "global presence_event exists after counter added", %{tenant: %{external_id: external_id}} do + pattern = + ~r/realtime_channel_global_presence_events\s(?<value>\d+)/ + + metric_value = metric_value(pattern) + FakeUserCounter.fake_presence_event(external_id) + Process.sleep(200) + assert metric_value(pattern) == metric_value + 1 + end + test "metric read_authorization_check exists after check", context do pattern = ~r/realtime_tenants_read_authorization_check_count{tenant="#{context.tenant.external_id}"}\s(?<value>\d+)/ @@ -231,18 +262,18 @@ defmodule Realtime.PromEx.Plugins.TenantTest do external_id = context.tenant.external_id pattern = - ~r/realtime_tenants_payload_size_count{tenant="#{external_id}"}\s(?<value>\d+)/ + ~r/realtime_tenants_payload_size_count{message_type=\"presence\",tenant="#{external_id}"}\s(?<value>\d+)/ metric_value = metric_value(pattern) message = %{topic: "a topic", event: "an event", payload: ["a", %{"b" => "c"}, 1, 23]} - RealtimeWeb.TenantBroadcaster.pubsub_broadcast(external_id, "a topic", message, Phoenix.PubSub) + RealtimeWeb.TenantBroadcaster.pubsub_broadcast(external_id, "a topic", message, Phoenix.PubSub, :presence) Process.sleep(200) assert metric_value(pattern) == metric_value + 1 bucket_pattern = - ~r/realtime_tenants_payload_size_bucket{tenant="#{external_id}",le="100"}\s(?<value>\d+)/ + ~r/realtime_tenants_payload_size_bucket{message_type=\"presence\",tenant="#{external_id}",le="250"}\s(?<value>\d+)/ assert metric_value(bucket_pattern) > 0 end @@ -250,17 +281,17 @@ defmodule Realtime.PromEx.Plugins.TenantTest do test "global metric payload size", context do external_id = context.tenant.external_id - pattern = ~r/realtime_payload_size_count\s(?<value>\d+)/ + pattern = ~r/realtime_payload_size_count{message_type=\"broadcast\"}\s(?<value>\d+)/ metric_value = metric_value(pattern) message = %{topic: "a topic", event: "an event", payload: ["a", %{"b" => "c"}, 1, 23]} - RealtimeWeb.TenantBroadcaster.pubsub_broadcast(external_id, "a topic", message, Phoenix.PubSub) + RealtimeWeb.TenantBroadcaster.pubsub_broadcast(external_id, "a topic", message, Phoenix.PubSub, :broadcast) Process.sleep(200) assert metric_value(pattern) == metric_value + 1 - bucket_pattern = ~r/realtime_payload_size_bucket{le="100"}\s(?<value>\d+)/ + bucket_pattern = ~r/realtime_payload_size_bucket{message_type=\"broadcast\",le="250"}\s(?<value>\d+)/ assert metric_value(bucket_pattern) > 0 end diff --git a/test/realtime/monitoring/prom_ex/plugins/tenants_test.exs b/test/realtime/monitoring/prom_ex/plugins/tenants_test.exs index 080fd3cfb..ded087c74 100644 --- a/test/realtime/monitoring/prom_ex/plugins/tenants_test.exs +++ b/test/realtime/monitoring/prom_ex/plugins/tenants_test.exs @@ -37,6 +37,16 @@ defmodule Realtime.PromEx.Plugins.TenantsTest do assert metric_value(pattern) == previous_value + 1 end + test "global success" do + pattern = ~r/realtime_global_rpc_count{mechanism=\"erpc\",success="true"}\s(?<value>\d+)/ + # Enough time for the poll rate to be triggered at least once + Process.sleep(200) + previous_value = metric_value(pattern) + assert {:ok, "success"} = Rpc.enhanced_call(node(), Test, :success, [], tenant_id: "123") + Process.sleep(200) + assert metric_value(pattern) == previous_value + 1 + end + test "failure" do pattern = ~r/realtime_rpc_count{mechanism=\"erpc\",success="false",tenant="123"}\s(?<value>\d+)/ # Enough time for the poll rate to be triggered at least once @@ -47,6 +57,16 @@ defmodule Realtime.PromEx.Plugins.TenantsTest do assert metric_value(pattern) == previous_value + 1 end + test "global failure" do + pattern = ~r/realtime_global_rpc_count{mechanism=\"erpc\",success="false"}\s(?<value>\d+)/ + # Enough time for the poll rate to be triggered at least once + Process.sleep(200) + previous_value = metric_value(pattern) + assert {:error, "failure"} = Rpc.enhanced_call(node(), Test, :failure, [], tenant_id: "123") + Process.sleep(200) + assert metric_value(pattern) == previous_value + 1 + end + test "exception" do pattern = ~r/realtime_rpc_count{mechanism=\"erpc\",success="false",tenant="123"}\s(?<value>\d+)/ # Enough time for the poll rate to be triggered at least once @@ -59,6 +79,19 @@ defmodule Realtime.PromEx.Plugins.TenantsTest do Process.sleep(200) assert metric_value(pattern) == previous_value + 1 end + + test "global exception" do + pattern = ~r/realtime_global_rpc_count{mechanism=\"erpc\",success="false"}\s(?<value>\d+)/ + # Enough time for the poll rate to be triggered at least once + Process.sleep(200) + previous_value = metric_value(pattern) + + assert {:error, :rpc_error, %RuntimeError{message: "runtime error"}} = + Rpc.enhanced_call(node(), Test, :exception, [], tenant_id: "123") + + Process.sleep(200) + assert metric_value(pattern) == previous_value + 1 + end end test "event_metrics rpc" do diff --git a/test/realtime/tenants/connect_test.exs b/test/realtime/tenants/connect_test.exs index a52973d53..741f6ecf7 100644 --- a/test/realtime/tenants/connect_test.exs +++ b/test/realtime/tenants/connect_test.exs @@ -51,6 +51,36 @@ defmodule Realtime.Tenants.ConnectTest do end describe "handle cold start" do + test "multiple processes connecting calling Connect.connect", %{tenant: tenant} do + parent = self() + + # Let's slow down Connect.connect so that multiple RPC calls are executed + stub(Connect, :connect, fn x, y, z -> + :timer.sleep(1000) + call_original(Connect, :connect, [x, y, z]) + end) + + connect = fn -> send(parent, Connect.lookup_or_start_connection(tenant.external_id)) end + # Let's call enough times to potentially trigger the Connect RateCounter + + for _ <- 1..50, do: spawn(connect) + + assert_receive({:ok, pid}, 1100) + + for _ <- 1..49, do: assert_receive({:ok, ^pid}) + + # Does not trigger rate limit as connections eventually succeeded + + {:ok, rate_counter} = + tenant.external_id + |> Tenants.connect_errors_per_second_rate() + |> Realtime.RateCounter.get() + + assert rate_counter.sum == 0 + assert rate_counter.avg == 0.0 + assert rate_counter.limit.triggered == false + end + test "multiple processes succeed together", %{tenant: tenant} do parent = self() diff --git a/test/realtime_web/channels/realtime_channel/message_dispatcher_test.exs b/test/realtime_web/channels/realtime_channel/message_dispatcher_test.exs index 44ce83b99..53be2e51f 100644 --- 
a/test/realtime_web/channels/realtime_channel/message_dispatcher_test.exs +++ b/test/realtime_web/channels/realtime_channel/message_dispatcher_test.exs @@ -16,12 +16,24 @@ defmodule RealtimeWeb.RealtimeChannel.MessageDispatcherTest do describe "fastlane_metadata/5" do test "info level" do assert MessageDispatcher.fastlane_metadata(self(), Serializer, "realtime:topic", :info, "tenant_id") == - {:rc_fastlane, self(), Serializer, "realtime:topic", {:log, "tenant_id"}, MapSet.new()} + {:rc_fastlane, self(), Serializer, "realtime:topic", :info, "tenant_id", MapSet.new()} end test "non-info level" do assert MessageDispatcher.fastlane_metadata(self(), Serializer, "realtime:topic", :warning, "tenant_id") == - {:rc_fastlane, self(), Serializer, "realtime:topic", MapSet.new()} + {:rc_fastlane, self(), Serializer, "realtime:topic", :warning, "tenant_id", MapSet.new()} + end + + test "replayed message ids" do + assert MessageDispatcher.fastlane_metadata( + self(), + Serializer, + "realtime:topic", + :warning, + "tenant_id", + MapSet.new([1]) + ) == + {:rc_fastlane, self(), Serializer, "realtime:topic", :warning, "tenant_id", MapSet.new([1])} end end @@ -50,8 +62,8 @@ defmodule RealtimeWeb.RealtimeChannel.MessageDispatcherTest do from_pid = :erlang.list_to_pid(~c'<0.2.1>') subscribers = [ - {subscriber_pid, {:rc_fastlane, self(), TestSerializer, "realtime:topic", {:log, "tenant123"}, MapSet.new()}}, - {subscriber_pid, {:rc_fastlane, self(), TestSerializer, "realtime:topic", MapSet.new()}} + {subscriber_pid, {:rc_fastlane, self(), TestSerializer, "realtime:topic", :info, "tenant123", MapSet.new()}}, + {subscriber_pid, {:rc_fastlane, self(), TestSerializer, "realtime:topic", :warning, "tenant123", MapSet.new()}} ] msg = %Broadcast{topic: "some:other:topic", event: "event", payload: %{data: "test"}} @@ -74,6 +86,48 @@ defmodule RealtimeWeb.RealtimeChannel.MessageDispatcherTest do refute_receive _any end + test "dispatches 'presence_diff' messages to fastlane subscribers" do + parent = self() + + subscriber_pid = + spawn(fn -> + loop = fn loop -> + receive do + msg -> + send(parent, {:subscriber, msg}) + loop.(loop) + end + end + + loop.(loop) + end) + + from_pid = :erlang.list_to_pid(~c'<0.2.1>') + + subscribers = [ + {subscriber_pid, {:rc_fastlane, self(), TestSerializer, "realtime:topic", :info, "tenant456", MapSet.new()}}, + {subscriber_pid, {:rc_fastlane, self(), TestSerializer, "realtime:topic", :warning, "tenant456", MapSet.new()}} + ] + + msg = %Broadcast{topic: "some:other:topic", event: "presence_diff", payload: %{data: "test"}} + + log = + capture_log(fn -> + assert MessageDispatcher.dispatch(subscribers, from_pid, msg) == :ok + end) + + assert log =~ "Received message on realtime:topic with payload: #{inspect(msg, pretty: true)}" + + assert_receive {:encoded, %Broadcast{event: "presence_diff", payload: %{data: "test"}, topic: "realtime:topic"}} + assert_receive {:encoded, %Broadcast{event: "presence_diff", payload: %{data: "test"}, topic: "realtime:topic"}} + + assert Agent.get(TestSerializer, & &1) == 1 + + assert Realtime.GenCounter.get(Realtime.Tenants.presence_events_per_second_key("tenant456")) == 2 + + refute_receive _any + end + test "does not dispatch messages to fastlane subscribers if they already replayed it" do parent = self() @@ -95,8 +149,9 @@ defmodule RealtimeWeb.RealtimeChannel.MessageDispatcherTest do subscribers = [ {subscriber_pid, - {:rc_fastlane, self(), TestSerializer, "realtime:topic", {:log, "tenant123"}, replaeyd_message_ids}}, - {subscriber_pid, {:rc_fastlane, self(), 
TestSerializer, "realtime:topic", replaeyd_message_ids}} + {:rc_fastlane, self(), TestSerializer, "realtime:topic", :info, "tenant123", replaeyd_message_ids}}, + {subscriber_pid, + {:rc_fastlane, self(), TestSerializer, "realtime:topic", :warning, "tenant123", replaeyd_message_ids}} ] msg = %Broadcast{ @@ -131,8 +186,8 @@ defmodule RealtimeWeb.RealtimeChannel.MessageDispatcherTest do from_pid = :erlang.list_to_pid(~c'<0.2.1>') subscribers = [ - {subscriber_pid, {:rc_fastlane, self(), TestSerializer, "realtime:topic", {:log, "tenant123"}, MapSet.new()}}, - {subscriber_pid, {:rc_fastlane, self(), TestSerializer, "realtime:topic", MapSet.new()}} + {subscriber_pid, {:rc_fastlane, self(), TestSerializer, "realtime:topic", :info, "tenant123", MapSet.new()}}, + {subscriber_pid, {:rc_fastlane, self(), TestSerializer, "realtime:topic", :warning, "tenant123", MapSet.new()}} ] msg = %Broadcast{topic: "some:other:topic", event: "event", payload: "not a map"} diff --git a/test/realtime_web/channels/realtime_channel/presence_handler_test.exs b/test/realtime_web/channels/realtime_channel/presence_handler_test.exs index 0cdf422e2..219f13e55 100644 --- a/test/realtime_web/channels/realtime_channel/presence_handler_test.exs +++ b/test/realtime_web/channels/realtime_channel/presence_handler_test.exs @@ -100,25 +100,41 @@ defmodule RealtimeWeb.RealtimeChannel.PresenceHandlerTest do end describe "handle/3" do + setup do + on_exit(fn -> :telemetry.detach(__MODULE__) end) + + :telemetry.attach( + __MODULE__, + [:realtime, :tenants, :payload, :size], + &__MODULE__.handle_telemetry/4, + pid: self() + ) + end + test "with true policy and is private, user can track their presence and changes", %{ tenant: tenant, topic: topic, db_conn: db_conn } do + external_id = tenant.external_id key = random_string() policies = %Policies{presence: %PresencePolicies{read: true, write: true}} socket = socket_fixture(tenant, topic, key, policies: policies) - PresenceHandler.handle(%{"event" => "track"}, db_conn, socket) + PresenceHandler.handle(%{"event" => "track", "payload" => %{"A" => "b", "c" => "b"}}, db_conn, socket) topic = socket.assigns.tenant_topic assert_receive %Broadcast{topic: ^topic, event: "presence_diff", payload: %{joins: joins, leaves: %{}}} assert Map.has_key?(joins, key) + + assert_receive {:telemetry, [:realtime, :tenants, :payload, :size], %{size: 30}, + %{tenant: ^external_id, message_type: :presence}} end test "when tracking already existing user, metadata updated", %{tenant: tenant, topic: topic, db_conn: db_conn} do + external_id = tenant.external_id key = random_string() policies = %Policies{presence: %PresencePolicies{read: true, write: true}} socket = socket_fixture(tenant, topic, key, policies: policies) @@ -134,10 +150,18 @@ defmodule RealtimeWeb.RealtimeChannel.PresenceHandlerTest do assert_receive %Broadcast{topic: ^topic, event: "presence_diff", payload: %{joins: joins, leaves: %{}}} assert Map.has_key?(joins, key) + + assert_receive {:telemetry, [:realtime, :tenants, :payload, :size], %{size: 6}, + %{tenant: ^external_id, message_type: :presence}} + + assert_receive {:telemetry, [:realtime, :tenants, :payload, :size], %{size: 55}, + %{tenant: ^external_id, message_type: :presence}} + refute_receive :_ end test "with false policy and is public, user can track their presence and changes", %{tenant: tenant, topic: topic} do + external_id = tenant.external_id key = random_string() policies = %Policies{presence: %PresencePolicies{read: false, write: false}} socket = socket_fixture(tenant, topic, key, 
policies: policies, private?: false) @@ -147,6 +171,9 @@ defmodule RealtimeWeb.RealtimeChannel.PresenceHandlerTest do topic = socket.assigns.tenant_topic assert_receive %Broadcast{topic: ^topic, event: "presence_diff", payload: %{joins: joins, leaves: %{}}} assert Map.has_key?(joins, key) + + assert_receive {:telemetry, [:realtime, :tenants, :payload, :size], %{size: 6}, + %{tenant: ^external_id, message_type: :presence}} end test "user can untrack when they want", %{tenant: tenant, topic: topic, db_conn: db_conn} do @@ -434,6 +461,7 @@ defmodule RealtimeWeb.RealtimeChannel.PresenceHandlerTest do assert log =~ "PresenceRateLimitReached" end + @tag :skip @tag policies: [:authenticated_read_broadcast_and_presence, :authenticated_write_broadcast_and_presence] test "respects rate limits on private channels", %{tenant: tenant, topic: topic, db_conn: db_conn} do key = random_string() @@ -517,4 +545,6 @@ defmodule RealtimeWeb.RealtimeChannel.PresenceHandlerTest do } } end + + def handle_telemetry(event, measures, metadata, pid: pid), do: send(pid, {:telemetry, event, measures, metadata}) end diff --git a/test/realtime_web/channels/realtime_channel_test.exs b/test/realtime_web/channels/realtime_channel_test.exs index ae6c1734a..8022d6ebd 100644 --- a/test/realtime_web/channels/realtime_channel_test.exs +++ b/test/realtime_web/channels/realtime_channel_test.exs @@ -239,23 +239,14 @@ defmodule RealtimeWeb.RealtimeChannelTest do end describe "presence" do - test "events are counted", %{tenant: tenant} do + test "presence state event is counted", %{tenant: tenant} do jwt = Generators.generate_jwt_token(tenant) {:ok, %Socket{} = socket} = connect(UserSocket, %{"log_level" => "warning"}, conn_opts(tenant, jwt)) assert {:ok, _, %Socket{} = socket} = subscribe_and_join(socket, "realtime:test", %{}) - presence_diff = %Socket.Broadcast{event: "presence_diff", payload: %{joins: %{}, leaves: %{}}} - send(socket.channel_pid, presence_diff) - assert_receive %Socket.Message{topic: "realtime:test", event: "presence_state", payload: %{}} - assert_receive %Socket.Message{ - topic: "realtime:test", - event: "presence_diff", - payload: %{joins: %{}, leaves: %{}} - } - tenant_id = tenant.external_id # Wait for RateCounter to tick @@ -264,8 +255,8 @@ defmodule RealtimeWeb.RealtimeChannelTest do assert {:ok, %RateCounter{id: {:channel, :presence_events, ^tenant_id}, bucket: bucket}} = RateCounter.get(socket.assigns.presence_rate_counter) - # presence_state + presence_diff - assert 2 in bucket + # presence_state + assert Enum.sum(bucket) == 1 end end diff --git a/test/realtime_web/channels/tenant_rate_limiters_test.exs b/test/realtime_web/channels/tenant_rate_limiters_test.exs new file mode 100644 index 000000000..05d56ec82 --- /dev/null +++ b/test/realtime_web/channels/tenant_rate_limiters_test.exs @@ -0,0 +1,31 @@ +defmodule RealtimeWeb.TenantRateLimitersTest do + use Realtime.DataCase, async: true + + use Mimic + alias RealtimeWeb.TenantRateLimiters + alias Realtime.Api.Tenant + + setup do + tenant = %Tenant{external_id: random_string(), max_concurrent_users: 1, max_joins_per_second: 1} + + %{tenant: tenant} + end + + describe "check_tenant/1" do + test "rate is not exceeded", %{tenant: tenant} do + assert TenantRateLimiters.check_tenant(tenant) == :ok + end + + test "max concurrent users is exceeded", %{tenant: tenant} do + Realtime.UsersCounter.add(self(), tenant.external_id) + + assert TenantRateLimiters.check_tenant(tenant) == {:error, :too_many_connections} + end + + test "max joins is exceeded", %{tenant: 
tenant} do + expect(Realtime.RateCounter, :get, fn _ -> {:ok, %{limit: %{triggered: true}}} end) + + assert TenantRateLimiters.check_tenant(tenant) == {:error, :too_many_joins} + end + end +end diff --git a/test/realtime_web/controllers/broadcast_controller_test.exs b/test/realtime_web/controllers/broadcast_controller_test.exs index 7bd426353..900eb7aa9 100644 --- a/test/realtime_web/controllers/broadcast_controller_test.exs +++ b/test/realtime_web/controllers/broadcast_controller_test.exs @@ -272,8 +272,7 @@ defmodule RealtimeWeb.BroadcastControllerTest do } do request_events_key = Tenants.requests_per_second_key(tenant) broadcast_events_key = Tenants.events_per_second_key(tenant) - connect_events_key = Tenants.connect_per_second_rate(tenant).id - expect(TenantBroadcaster, :pubsub_broadcast, 5, fn _, _, _, _ -> :ok end) + expect(TenantBroadcaster, :pubsub_broadcast, 5, fn _, _, _, _, _ -> :ok end) messages_to_send = Stream.repeatedly(fn -> generate_message_with_policies(db_conn, tenant) end) @@ -298,7 +297,7 @@ defmodule RealtimeWeb.BroadcastControllerTest do conn = post(conn, Routes.broadcast_path(conn, :broadcast), %{"messages" => messages}) - broadcast_calls = calls(&TenantBroadcaster.pubsub_broadcast/4) + broadcast_calls = calls(&TenantBroadcaster.pubsub_broadcast/5) Enum.each(messages_to_send, fn %{topic: topic} -> broadcast_topic = Tenants.tenant_topic(tenant, topic, false) @@ -314,7 +313,7 @@ defmodule RealtimeWeb.BroadcastControllerTest do } assert Enum.any?(broadcast_calls, fn - [_, ^broadcast_topic, ^message, RealtimeChannel.MessageDispatcher] -> true + [_, ^broadcast_topic, ^message, RealtimeChannel.MessageDispatcher, :broadcast] -> true _ -> false end) end) @@ -330,8 +329,7 @@ defmodule RealtimeWeb.BroadcastControllerTest do } do request_events_key = Tenants.requests_per_second_key(tenant) broadcast_events_key = Tenants.events_per_second_key(tenant) - connect_events_key = Tenants.connect_per_second_rate(tenant).id - expect(TenantBroadcaster, :pubsub_broadcast, 6, fn _, _, _, _ -> :ok end) + expect(TenantBroadcaster, :pubsub_broadcast, 6, fn _, _, _, _, _ -> :ok end) channels = Stream.repeatedly(fn -> generate_message_with_policies(db_conn, tenant) end) @@ -366,7 +364,7 @@ defmodule RealtimeWeb.BroadcastControllerTest do conn = post(conn, Routes.broadcast_path(conn, :broadcast), %{"messages" => messages}) - broadcast_calls = calls(&TenantBroadcaster.pubsub_broadcast/4) + broadcast_calls = calls(&TenantBroadcaster.pubsub_broadcast/5) Enum.each(channels, fn %{topic: topic} -> broadcast_topic = Tenants.tenant_topic(tenant, topic, false) @@ -382,7 +380,7 @@ defmodule RealtimeWeb.BroadcastControllerTest do } assert Enum.count(broadcast_calls, fn - [_, ^broadcast_topic, ^message, RealtimeChannel.MessageDispatcher] -> true + [_, ^broadcast_topic, ^message, RealtimeChannel.MessageDispatcher, :broadcast] -> true _ -> false end) == 1 end) @@ -401,7 +399,7 @@ defmodule RealtimeWeb.BroadcastControllerTest do open_channel_topic = Tenants.tenant_topic(tenant, "open_channel", true) assert Enum.count(broadcast_calls, fn - [_, ^open_channel_topic, ^message, RealtimeChannel.MessageDispatcher] -> true + [_, ^open_channel_topic, ^message, RealtimeChannel.MessageDispatcher, :broadcast] -> true _ -> false end) == 1 @@ -416,8 +414,7 @@ defmodule RealtimeWeb.BroadcastControllerTest do } do request_events_key = Tenants.requests_per_second_key(tenant) broadcast_events_key = Tenants.events_per_second_key(tenant) - connect_events_key = Tenants.connect_per_second_rate(tenant).id - expect(TenantBroadcaster, 
:pubsub_broadcast, 5, fn _, _, _, _ -> :ok end) + expect(TenantBroadcaster, :pubsub_broadcast, 5, fn _, _, _, _, _ -> :ok end) messages_to_send = Stream.repeatedly(fn -> generate_message_with_policies(db_conn, tenant) end) @@ -438,12 +435,11 @@ defmodule RealtimeWeb.BroadcastControllerTest do GenCounter |> expect(:add, fn ^request_events_key -> :ok end) # remove the one message that won't be broadcasted for this user - |> expect(:add, 1, fn ^connect_events_key -> :ok end) |> expect(:add, length(messages) - 1, fn ^broadcast_events_key -> :ok end) conn = post(conn, Routes.broadcast_path(conn, :broadcast), %{"messages" => messages}) - broadcast_calls = calls(&TenantBroadcaster.pubsub_broadcast/4) + broadcast_calls = calls(&TenantBroadcaster.pubsub_broadcast/5) Enum.each(messages_to_send, fn %{topic: topic} -> broadcast_topic = Tenants.tenant_topic(tenant, topic, false) @@ -459,7 +455,7 @@ defmodule RealtimeWeb.BroadcastControllerTest do } assert Enum.count(broadcast_calls, fn - [_, ^broadcast_topic, ^message, RealtimeChannel.MessageDispatcher] -> true + [_, ^broadcast_topic, ^message, RealtimeChannel.MessageDispatcher, :broadcast] -> true _ -> false end) == 1 end) @@ -472,7 +468,7 @@ defmodule RealtimeWeb.BroadcastControllerTest do @tag role: "anon" test "user without permission won't broadcast", %{conn: conn, db_conn: db_conn, tenant: tenant} do request_events_key = Tenants.requests_per_second_key(tenant) - reject(&TenantBroadcaster.pubsub_broadcast/4) + reject(&TenantBroadcaster.pubsub_broadcast/5) messages = Stream.repeatedly(fn -> generate_message_with_policies(db_conn, tenant) end) diff --git a/test/realtime_web/tenant_broadcaster_test.exs b/test/realtime_web/tenant_broadcaster_test.exs index ddda381a1..bc3b4f90a 100644 --- a/test/realtime_web/tenant_broadcaster_test.exs +++ b/test/realtime_web/tenant_broadcaster_test.exs @@ -60,7 +60,7 @@ defmodule RealtimeWeb.TenantBroadcasterTest do test "pubsub_broadcast", %{node: node} do message = %Broadcast{topic: @topic, event: "an event", payload: %{"a" => "b"}} - TenantBroadcaster.pubsub_broadcast("realtime-dev", @topic, message, Phoenix.PubSub) + TenantBroadcaster.pubsub_broadcast("realtime-dev", @topic, message, Phoenix.PubSub, :broadcast) assert_receive ^message @@ -71,13 +71,13 @@ defmodule RealtimeWeb.TenantBroadcasterTest do :telemetry, [:realtime, :tenants, :payload, :size], %{size: 114}, - %{tenant: "realtime-dev"} + %{tenant: "realtime-dev", message_type: :broadcast} } end test "pubsub_broadcast list payload", %{node: node} do message = %Broadcast{topic: @topic, event: "an event", payload: ["a", %{"b" => "c"}, 1, 23]} - TenantBroadcaster.pubsub_broadcast("realtime-dev", @topic, message, Phoenix.PubSub) + TenantBroadcaster.pubsub_broadcast("realtime-dev", @topic, message, Phoenix.PubSub, :broadcast) assert_receive ^message @@ -88,13 +88,13 @@ defmodule RealtimeWeb.TenantBroadcasterTest do :telemetry, [:realtime, :tenants, :payload, :size], %{size: 130}, - %{tenant: "realtime-dev"} + %{tenant: "realtime-dev", message_type: :broadcast} } end test "pubsub_broadcast string payload", %{node: node} do message = %Broadcast{topic: @topic, event: "an event", payload: "some text payload"} - TenantBroadcaster.pubsub_broadcast("realtime-dev", @topic, message, Phoenix.PubSub) + TenantBroadcaster.pubsub_broadcast("realtime-dev", @topic, message, Phoenix.PubSub, :broadcast) assert_receive ^message @@ -105,7 +105,7 @@ defmodule RealtimeWeb.TenantBroadcasterTest do :telemetry, [:realtime, :tenants, :payload, :size], %{size: 119}, - %{tenant: 
"realtime-dev"} + %{tenant: "realtime-dev", message_type: :broadcast} } end end @@ -131,7 +131,7 @@ defmodule RealtimeWeb.TenantBroadcasterTest do message = %Broadcast{topic: @topic, event: "an event", payload: %{"a" => "b"}} - TenantBroadcaster.pubsub_broadcast_from("realtime-dev", self(), @topic, message, Phoenix.PubSub) + TenantBroadcaster.pubsub_broadcast_from("realtime-dev", self(), @topic, message, Phoenix.PubSub, :broadcast) assert_receive {:other_process, ^message} @@ -142,7 +142,7 @@ defmodule RealtimeWeb.TenantBroadcasterTest do :telemetry, [:realtime, :tenants, :payload, :size], %{size: 114}, - %{tenant: "realtime-dev"} + %{tenant: "realtime-dev", message_type: :broadcast} } # This process does not receive the message @@ -151,5 +151,38 @@ defmodule RealtimeWeb.TenantBroadcasterTest do end end + describe "collect_payload_size/3" do + @describetag pubsub_adapter: :gen_rpc + + test "emit telemetry for struct" do + TenantBroadcaster.collect_payload_size( + "realtime-dev", + %Phoenix.Socket.Broadcast{event: "broadcast", payload: %{"a" => "b"}}, + :broadcast + ) + + assert_receive {:telemetry, [:realtime, :tenants, :payload, :size], %{size: 65}, + %{tenant: "realtime-dev", message_type: :broadcast}} + end + + test "emit telemetry for map" do + TenantBroadcaster.collect_payload_size( + "realtime-dev", + %{event: "broadcast", payload: %{"a" => "b"}}, + :postgres_changes + ) + + assert_receive {:telemetry, [:realtime, :tenants, :payload, :size], %{size: 53}, + %{tenant: "realtime-dev", message_type: :postgres_changes}} + end + + test "emit telemetry for non-map" do + TenantBroadcaster.collect_payload_size("realtime-dev", "some blob", :presence) + + assert_receive {:telemetry, [:realtime, :tenants, :payload, :size], %{size: 15}, + %{tenant: "realtime-dev", message_type: :presence}} + end + end + def handle_telemetry(event, measures, metadata, pid: pid), do: send(pid, {:telemetry, event, measures, metadata}) end From eb8b91dafb1f2317f8f66ada178d288e1faf13d6 Mon Sep 17 00:00:00 2001 From: "Al @h0lybyte" <5599058+h0lybyte@users.noreply.github.com> Date: Sun, 12 Oct 2025 13:40:43 -0400 Subject: [PATCH 8/8] =?UTF-8?q?=F0=9F=94=84=20Sync=20with=20upstream=20cha?= =?UTF-8?q?nges=20(#15)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: runtime setup error (#1520) * fix: use primary instead of replica on rename_settings_field (#1521) * feat: upgrade cowboy & ranch (#1523) * fix: Fix GenRpc to not try to connect to nodes that are not alive (#1525) * fix: enable presence on track message (#1527) currently the user would need to have enabled from the beginning of the channel. 
this will enable users to enable presence later in the flow by sending a track message, which enables presence messages for them

* fix: set cowboy active_n=100 as cowboy 2.12.0 (#1530)

cowboy 2.13.0 sets the default to active_n=1

* fix: provide error_code metadata on RealtimeChannel.Logging (#1531)

* feat: disable UTF8 validation on websocket frames (#1532)

Currently all text frames are handled only with JSON, which already requires UTF-8

* fix: move DB setup to happen after Connect.init (#1533)

This change reduces the impact of a slow DB setup on other tenants that landed on the same partition and are trying to connect at the same time

* fix: handle wal bloat (#1528)

Verify that the replication connection is able to reconnect when faced with WAL bloat issues

* feat: replay realtime.messages (#1526)

A new index was created on inserted_at DESC, topic WHERE private IS TRUE AND extension = "broadcast"

The hardcoded limit is 25 for now.

* feat: gen_rpc pub sub adapter (#1529)

Add a PubSub adapter that uses gen_rpc to send messages to other nodes. It uses :gen_rpc.abcast/3 instead of :erlang.send/2.

The adapter works very similarly to the PG2 adapter. It consists of multiple workers that forward to the local node using PubSub.local_broadcast. The worker is chosen based on the sending process, just like the PG2 adapter does.

The number of workers is controlled by `:pool_size` or `:broadcast_pool_size`. This distinction exists because Phoenix.PubSub uses `:pool_size` to define how many partitions the PubSub registry will use. It's possible to control them separately by using `:broadcast_pool_size`. (A sketch of this worker pattern follows below.)

* fix: ensure message id doesn't raise on non-map payloads (#1534)

* fix: match error on Connect (#1536)

---------

Co-authored-by: Eduardo Gurgel Pinho

* feat: websocket max heap size configuration (#1538)

* fix: set max process heap size to 500MB instead of 8GB

* feat: set websocket transport max heap size

WEBSOCKET_MAX_HEAP_SIZE can be used to configure it

* fix: update gen_rpc to fix gen_rpc_dispatcher issues (#1537)

Issues:

* Single gen_rpc_dispatcher that can be a bottleneck if connecting takes some time
* Many calls can land on the dispatcher but the node might be gone already. If we don't validate the node it might keep trying to connect until it times out instead of quickly giving up due to not being an actively connected node.

* fix: improve ErlSysMon logging for processes (#1540)

Include initial_call, ancestors, registered_name, message_queue_len and total_heap_size

Also bump long_schedule and long_gc

* fix: make pubsub adapter configurable (#1539)

* fix: specify that only private channels are allowed when replaying messages (#1543)

* fix: rate limit connect module (#1541)

On bad connection, we rate limit the Connect module to prevent abuse and excessive error logging

* build: automatically cancel old tests/build on new push (#1545)

Currently, whenever you push any commit to your branch, the old builds are still running and a new build is started. Once a new commit is added, the old test results no longer matter and it's just a waste of CI resources. Also reduces confusion with multiple builds running in parallel for the same branch, possibly blocking merges.

With this little change, we ensure that whenever a new commit is added, the previous build is immediately canceled/stopped and only the build for the latest commit runs.
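To make the #1529 description above concrete, here is a minimal editor's sketch of the worker pattern it describes. The module names (MyApp.GenRpcPubSub) are hypothetical and the real adapter in the Realtime codebase differs in detail; only :gen_rpc.abcast/3 and Phoenix.PubSub.local_broadcast/3 are taken from the commit message itself:

defmodule MyApp.GenRpcPubSub.Worker do
  # One of `broadcast_pool_size` workers; local delivery happens here so the
  # publishing process never blocks on fan-out.
  use GenServer

  def start_link(index), do: GenServer.start_link(__MODULE__, nil, name: name(index))

  def name(index), do: :"gen_rpc_pubsub_worker_#{index}"

  @impl true
  def init(nil) do
    # The follow-up fixes (#1548, #1551) move queue data off-heap and lower
    # fullsweep_after for these workers; off-heap is shown here for illustration.
    Process.flag(:message_queue_data, :off_heap)
    {:ok, nil}
  end

  @impl true
  def handle_info({:forward, pubsub, topic, message}, state) do
    # Deliver to subscribers on this node only; the remote fan-out already
    # happened via :gen_rpc.abcast/3 on the sending node.
    Phoenix.PubSub.local_broadcast(pubsub, topic, message)
    {:noreply, state}
  end
end

defmodule MyApp.GenRpcPubSub do
  # Pick a worker based on the sending process, as the PG2 adapter does, then
  # fan out to all other nodes with :gen_rpc.abcast/3 and deliver locally.
  def broadcast(pubsub, topic, message, pool_size) do
    worker = MyApp.GenRpcPubSub.Worker.name(:erlang.phash2(self(), pool_size))
    :gen_rpc.abcast(Node.list(), worker, {:forward, pubsub, topic, message})
    Phoenix.PubSub.local_broadcast(pubsub, topic, message)
  end
end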
* fix: move message queue data to off-heap for gen_rpc pub sub workers (#1548)

* fix: rate limit Connect.lookup_or_start_connection on error only (#1549)

* fix: increase connect error rate window to 30 seconds (#1550)

* fix: set a lower fullsweep_after flag for GenRpcPubSub workers (#1551)

* fix: hardcode presence limit (#1552)

* fix: further decrease limit on presence events (#1553)

* fix: bump up realtime (#1554)

* fix: lower rate limit to 100 events per second (#1556)

* fix: move connect rate limit to socket (#1555)

* fix: reduce max_frame_size to 5MB

* fix: fullsweep_after=100 on gen rpc pub sub workers

---------

Co-authored-by: Eduardo Gurgel Pinho

* fix: collect global metrics without tenant tagging (#1557)

* feat: presence payload size (#1559)

* Also tweak buckets to account for payloads all the way to 3000KB
* Start tagging the payload size metrics with message_type. message_type can be presence, broadcast or postgres_changes

* fix: use GenRpc for Realtime.Latency pings (#1560)

* Fastlane for phoenix presence_diff (#1558)

It uses a fork of Phoenix for the time being

* fix: count presence_diff events on MessageDispatcher

* fix: remove traces from console during development

* fix: limit db events (#1562)

* chore: split tests and lint workflows (#1564)

Also cache mix _build and deps

* fix: use LiveView stream for status page (#1565)

* fix: use LiveView stream for status page
* fix: need full node name on localhost for tests
* fix: cleanup
* fix: add tests
* fix: bump version
* fix: cleanup syntax
* fix: format

* fix: refine join payload checking (#1567)

* fix: shard user scopes in syn (#1566)

---------

Co-authored-by: Filipe Cabaço
Co-authored-by: Eduardo Gurgel
Co-authored-by: Kevin Grüneberg
Co-authored-by: Chase Granberry
Co-authored-by: Bradley Haljendi <5642609+Fudster@users.noreply.github.com>
---
 .github/workflows/lint.yml                    | 78 ++++++++++++++++
 .github/workflows/tests.yml                   | 42 +++------
 config/runtime.exs                            |  4 +-
 .../postgres_cdc_rls/replication_poller.ex    | 39 +++++---
 lib/realtime/application.ex                   |  3 +-
 lib/realtime/nodes.ex                         |  5 +-
 lib/realtime/tenants.ex                       | 28 +++++-
 lib/realtime/user_counter.ex                  | 21 ++++-
 lib/realtime_web/channels/payloads/config.ex  |  8 ++
 .../channels/payloads/presence.ex             |  2 +-
 lib/realtime_web/live/status_live/index.ex    | 31 ++++---
 .../live/status_live/index.html.heex          | 18 ++--
 mix.exs                                       |  2 +-
 test/integration/rt_channel_test.exs          |  7 +-
 .../extensions/cdc_rls/cdc_rls_test.exs       | 90 ++++++++++++++++++-
 test/realtime/tenants/connect_test.exs        |  4 +-
 .../channels/payloads/join_test.exs           | 14 +++
 .../live/status_live/index_test.exs           | 33 +++++++
 18 files changed, 350 insertions(+), 79 deletions(-)
 create mode 100644 .github/workflows/lint.yml
 create mode 100644 test/realtime_web/live/status_live/index_test.exs

diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
new file mode 100644
index 000000000..b27a4e9f3
--- /dev/null
+++ b/.github/workflows/lint.yml
@@ -0,0 +1,78 @@
+name: Lint
+on:
+  pull_request:
+    paths:
+      - "lib/**"
+      - "test/**"
+      - "config/**"
+      - "priv/**"
+      - "assets/**"
+      - "rel/**"
+      - "mix.exs"
+      - "Dockerfile"
+      - "run.sh"
+
+  push:
+    branches:
+      - main
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  tests:
+    name: Lint
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v2
+      - name: Setup elixir
+        id: beam
+        uses: erlef/setup-beam@v1
+        with:
+          otp-version: 27.x # Define the OTP version [required]
+          elixir-version: 1.17.x # Define the elixir version
[required] + - name: Cache Mix + uses: actions/cache@v4 + with: + path: | + deps + _build + key: ${{ github.workflow }}-${{ runner.os }}-mix-${{ env.elixir }}-${{ env.otp }}-${{ hashFiles('**/mix.lock') }} + restore-keys: | + ${{ github.workflow }}-${{ runner.os }}-mix-${{ env.elixir }}-${{ env.otp }}- + + - name: Install dependencies + run: mix deps.get + - name: Set up Postgres + run: docker compose -f docker-compose.dbs.yml up -d + - name: Run main database migrations + run: mix ecto.migrate --log-migrator-sql + - name: Run database tenant migrations + run: mix ecto.migrate --migrations-path lib/realtime/tenants/repo/migrations + - name: Run format check + run: mix format --check-formatted + - name: Credo checks + run: mix credo + - name: Run hex audit + run: mix hex.audit + - name: Run mix_audit + run: mix deps.audit + - name: Run sobelow + run: mix sobelow --config .sobelow-conf + - name: Retrieve PLT Cache + uses: actions/cache@v4 + id: plt-cache + with: + path: priv/plts + key: ${{ runner.os }}-${{ steps.beam.outputs.otp-version }}-${{ steps.beam.outputs.elixir-version }}-plts-${{ hashFiles(format('{0}{1}', github.workspace, '/mix.lock')) }} + - name: Create PLTs + if: steps.plt-cache.outputs.cache-hit != 'true' + run: | + mkdir -p priv/plts + mix dialyzer.build + - name: Run dialyzer + run: mix dialyzer + - name: Run dev seeds + run: DB_ENC_KEY="1234567890123456" mix ecto.setup diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index c9c2a73fa..45d27634a 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -20,6 +20,9 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true +env: + MIX_ENV: test + jobs: tests: name: Tests @@ -36,44 +39,19 @@ jobs: - name: Cache Mix uses: actions/cache@v4 with: - path: deps - key: ${{ runner.os }}-mix-${{ hashFiles(format('{0}{1}', github.workspace, '/mix.lock')) }} + path: | + deps + _build + key: ${{ github.workflow }}-${{ runner.os }}-mix-${{ env.elixir }}-${{ env.otp }}-${{ hashFiles('**/mix.lock') }} restore-keys: | - ${{ runner.os }}-mix- + ${{ github.workflow }}-${{ runner.os }}-mix-${{ env.elixir }}-${{ env.otp }}- + - name: Pull postgres image quietly in background (used by test/support/containers.ex) + run: docker pull supabase/postgres:15.8.1.040 > /dev/null 2>&1 & - name: Install dependencies run: mix deps.get - name: Set up Postgres run: docker compose -f docker-compose.dbs.yml up -d - - name: Run main database migrations - run: mix ecto.migrate --log-migrator-sql - - name: Run database tenant migrations - run: mix ecto.migrate --migrations-path lib/realtime/tenants/repo/migrations - - name: Run format check - run: mix format --check-formatted - - name: Credo checks - run: mix credo - - name: Run hex audit - run: mix hex.audit - - name: Run mix_audit - run: mix deps.audit - - name: Run sobelow - run: mix sobelow --config .sobelow-conf - - name: Retrieve PLT Cache - uses: actions/cache@v4 - id: plt-cache - with: - path: priv/plts - key: ${{ runner.os }}-${{ steps.beam.outputs.otp-version }}-${{ steps.beam.outputs.elixir-version }}-plts-${{ hashFiles(format('{0}{1}', github.workspace, '/mix.lock')) }} - - name: Create PLTs - if: steps.plt-cache.outputs.cache-hit != 'true' - run: | - mkdir -p priv/plts - mix dialyzer.build - - name: Run dialyzer - run: mix dialyzer - - name: Run dev seeds - run: DB_ENC_KEY="1234567890123456" mix ecto.setup - name: Start epmd run: epmd -daemon - name: Run tests diff --git a/config/runtime.exs 
b/config/runtime.exs index 447934b65..f09d22846 100644 --- a/config/runtime.exs +++ b/config/runtime.exs @@ -70,6 +70,7 @@ platform = if System.get_env("AWS_EXECUTION_ENV") == "AWS_ECS_FARGATE", do: :aws broadcast_pool_size = Env.get_integer("BROADCAST_POOL_SIZE", 10) pubsub_adapter = System.get_env("PUBSUB_ADAPTER", "gen_rpc") |> String.to_atom() websocket_max_heap_size = div(Env.get_integer("WEBSOCKET_MAX_HEAP_SIZE", 50_000_000), :erlang.system_info(:wordsize)) +users_scope_shards = Env.get_integer("USERS_SCOPE_SHARDS", 5) no_channel_timeout_in_ms = if config_env() == :test, @@ -126,7 +127,8 @@ config :realtime, no_channel_timeout_in_ms: no_channel_timeout_in_ms, platform: platform, pubsub_adapter: pubsub_adapter, - broadcast_pool_size: broadcast_pool_size + broadcast_pool_size: broadcast_pool_size, + users_scope_shards: users_scope_shards if config_env() != :test && run_janitor? do config :realtime, diff --git a/lib/extensions/postgres_cdc_rls/replication_poller.ex b/lib/extensions/postgres_cdc_rls/replication_poller.ex index 85466ebe9..34697572c 100644 --- a/lib/extensions/postgres_cdc_rls/replication_poller.ex +++ b/lib/extensions/postgres_cdc_rls/replication_poller.ex @@ -18,6 +18,8 @@ defmodule Extensions.PostgresCdcRls.ReplicationPoller do alias Realtime.Adapters.Changes.NewRecord alias Realtime.Adapters.Changes.UpdatedRecord alias Realtime.Database + alias Realtime.RateCounter + alias Realtime.Tenants def start_link(opts), do: GenServer.start_link(__MODULE__, opts) @@ -26,6 +28,12 @@ defmodule Extensions.PostgresCdcRls.ReplicationPoller do tenant_id = args["id"] Logger.metadata(external_id: tenant_id, project: tenant_id) + %Realtime.Api.Tenant{} = Tenants.Cache.get_tenant_by_external_id(tenant_id) + + rate_counter_args = Tenants.db_events_per_second_rate(tenant_id, 4000) + + RateCounter.new(rate_counter_args) + state = %{ backoff: Backoff.new(backoff_min: 100, backoff_max: 5_000, backoff_type: :rand_exp), db_host: args["db_host"], @@ -41,7 +49,8 @@ defmodule Extensions.PostgresCdcRls.ReplicationPoller do retry_ref: nil, retry_count: 0, slot_name: args["slot_name"] <> slot_name_suffix(), - tenant_id: tenant_id + tenant_id: tenant_id, + rate_counter_args: rate_counter_args } {:ok, _} = Registry.register(__MODULE__.Registry, tenant_id, %{}) @@ -74,7 +83,8 @@ defmodule Extensions.PostgresCdcRls.ReplicationPoller do max_record_bytes: max_record_bytes, max_changes: max_changes, conn: conn, - tenant_id: tenant_id + tenant_id: tenant_id, + rate_counter_args: rate_counter_args } = state ) do cancel_timer(poll_ref) @@ -84,7 +94,7 @@ defmodule Extensions.PostgresCdcRls.ReplicationPoller do {time, list_changes} = :timer.tc(Replications, :list_changes, args) record_list_changes_telemetry(time, tenant_id) - case handle_list_changes_result(list_changes, tenant_id) do + case handle_list_changes_result(list_changes, tenant_id, rate_counter_args) do {:ok, row_count} -> Backoff.reset(backoff) @@ -177,20 +187,29 @@ defmodule Extensions.PostgresCdcRls.ReplicationPoller do rows: [_ | _] = rows, num_rows: rows_count }}, - tenant_id + tenant_id, + rate_counter_args ) do - for row <- rows, - change <- columns |> Enum.zip(row) |> generate_record() |> List.wrap() do - topic = "realtime:postgres:" <> tenant_id + case RateCounter.get(rate_counter_args) do + {:ok, %{limit: %{triggered: true}}} -> + :ok - RealtimeWeb.TenantBroadcaster.pubsub_broadcast(tenant_id, topic, change, MessageDispatcher, :postgres_changes) + _ -> + Realtime.GenCounter.add(rate_counter_args.id, rows_count) + + for row <- rows, + change 
<- columns |> Enum.zip(row) |> generate_record() |> List.wrap() do + topic = "realtime:postgres:" <> tenant_id + + RealtimeWeb.TenantBroadcaster.pubsub_broadcast(tenant_id, topic, change, MessageDispatcher, :postgres_changes) + end end {:ok, rows_count} end - defp handle_list_changes_result({:ok, _}, _), do: {:ok, 0} - defp handle_list_changes_result({:error, reason}, _), do: {:error, reason} + defp handle_list_changes_result({:ok, _}, _, _), do: {:ok, 0} + defp handle_list_changes_result({:error, reason}, _, _), do: {:error, reason} def generate_record([ {"wal", diff --git a/lib/realtime/application.ex b/lib/realtime/application.ex index 99096edfb..45cc0271e 100644 --- a/lib/realtime/application.ex +++ b/lib/realtime/application.ex @@ -46,8 +46,7 @@ defmodule Realtime.Application do Realtime.PromEx.set_metrics_tags() :ets.new(Realtime.Tenants.Connect, [:named_table, :set, :public]) :syn.set_event_handler(Realtime.SynHandler) - - :ok = :syn.add_node_to_scopes([:users, RegionNodes, Realtime.Tenants.Connect]) + :ok = :syn.add_node_to_scopes([RegionNodes, Realtime.Tenants.Connect | Realtime.UsersCounter.scopes()]) region = Application.get_env(:realtime, :region) :syn.join(RegionNodes, region, self(), node: node()) diff --git a/lib/realtime/nodes.ex b/lib/realtime/nodes.ex index ae237eb5f..34c9f3cfb 100644 --- a/lib/realtime/nodes.ex +++ b/lib/realtime/nodes.ex @@ -105,7 +105,7 @@ defmodule Realtime.Nodes do iex> node = :"pink@127.0.0.1" iex> Realtime.Helpers.short_node_id_from_name(node) - "127.0.0.1" + "pink@127.0.0.1" iex> node = :"pink@10.0.1.1" iex> Realtime.Helpers.short_node_id_from_name(node) @@ -124,6 +124,9 @@ defmodule Realtime.Nodes do [_, _, _, _, _, one, two, _] -> one <> two + ["127.0.0.1"] -> + Atom.to_string(name) + _other -> host end diff --git a/lib/realtime/tenants.ex b/lib/realtime/tenants.ex index efd2397ac..9e53e18f1 100644 --- a/lib/realtime/tenants.ex +++ b/lib/realtime/tenants.ex @@ -21,7 +21,8 @@ defmodule Realtime.Tenants do """ @spec list_connected_tenants(atom()) :: [String.t()] def list_connected_tenants(node) do - :syn.group_names(:users, node) + UsersCounter.scopes() + |> Enum.flat_map(fn scope -> :syn.group_names(scope, node) end) end @doc """ @@ -247,6 +248,31 @@ defmodule Realtime.Tenants do %RateCounter.Args{id: db_events_per_second_key(tenant_id), opts: opts} end + @doc "RateCounter arguments for counting database events per second with a limit." + @spec db_events_per_second_rate(String.t(), non_neg_integer) :: RateCounter.Args.t() + def db_events_per_second_rate(tenant_id, max_events_per_second) when is_binary(tenant_id) do + opts = [ + telemetry: %{ + event_name: [:channel, :db_events], + measurements: %{}, + metadata: %{tenant: tenant_id} + }, + limit: [ + value: max_events_per_second, + measurement: :avg, + log: true, + log_fn: fn -> + Logger.error("MessagePerSecondRateLimitReached: Too many postgres changes messages per second", + external_id: tenant_id, + project: tenant_id + ) + end + ] + ] + + %RateCounter.Args{id: db_events_per_second_key(tenant_id), opts: opts} + end + @doc """ The GenCounter key to use when counting events for RealtimeChannel events. 
iex> Realtime.Tenants.db_events_per_second_key("tenant_id") diff --git a/lib/realtime/user_counter.ex b/lib/realtime/user_counter.ex index 6190030d9..9ea38c780 100644 --- a/lib/realtime/user_counter.ex +++ b/lib/realtime/user_counter.ex @@ -8,17 +8,32 @@ defmodule Realtime.UsersCounter do Adds a RealtimeChannel pid to the `:users` scope for a tenant so we can keep track of all connected clients for a tenant. """ @spec add(pid(), String.t()) :: :ok - def add(pid, tenant), do: :syn.join(:users, tenant, pid) + def add(pid, tenant_id), do: tenant_id |> scope() |> :syn.join(tenant_id, pid) @doc """ Returns the count of all connected clients for a tenant for the cluster. """ @spec tenant_users(String.t()) :: non_neg_integer() - def tenant_users(tenant), do: :syn.member_count(:users, tenant) + def tenant_users(tenant_id), do: tenant_id |> scope() |> :syn.member_count(tenant_id) @doc """ Returns the count of all connected clients for a tenant for a single node. """ @spec tenant_users(atom, String.t()) :: non_neg_integer() - def tenant_users(node_name, tenant), do: :syn.member_count(:users, tenant, node_name) + def tenant_users(node_name, tenant_id), do: tenant_id |> scope() |> :syn.member_count(tenant_id, node_name) + + @doc """ + Returns the scope for a given tenant id. + """ + @spec scope(String.t()) :: atom() + def scope(tenant_id) do + shards = Application.get_env(:realtime, :users_scope_shards) + shard = :erlang.phash2(tenant_id, shards) + :"users_#{shard}" + end + + def scopes() do + shards = Application.get_env(:realtime, :users_scope_shards) + Enum.map(0..(shards - 1), fn shard -> :"users_#{shard}" end) + end end diff --git a/lib/realtime_web/channels/payloads/config.ex b/lib/realtime_web/channels/payloads/config.ex index 923020174..029aa93b5 100644 --- a/lib/realtime_web/channels/payloads/config.ex +++ b/lib/realtime_web/channels/payloads/config.ex @@ -17,6 +17,14 @@ defmodule RealtimeWeb.Channels.Payloads.Config do end def changeset(config, attrs) do + attrs = + attrs + |> Enum.map(fn + {k, v} when is_list(v) -> {k, Enum.filter(v, fn v -> v != nil end)} + {k, v} -> {k, v} + end) + |> Map.new() + config |> cast(attrs, [:private], message: &Join.error_message/2) |> cast_embed(:broadcast, invalid_message: "unable to parse, expected a map") diff --git a/lib/realtime_web/channels/payloads/presence.ex b/lib/realtime_web/channels/payloads/presence.ex index 53e09047d..785df9222 100644 --- a/lib/realtime_web/channels/payloads/presence.ex +++ b/lib/realtime_web/channels/payloads/presence.ex @@ -8,7 +8,7 @@ defmodule RealtimeWeb.Channels.Payloads.Presence do embedded_schema do field :enabled, :boolean, default: true - field :key, :string, default: UUID.uuid1() + field :key, :any, default: UUID.uuid1(), virtual: true end def changeset(presence, attrs) do diff --git a/lib/realtime_web/live/status_live/index.ex b/lib/realtime_web/live/status_live/index.ex index 8a2d32054..f55eddfa5 100644 --- a/lib/realtime_web/live/status_live/index.ex +++ b/lib/realtime_web/live/status_live/index.ex @@ -3,11 +3,18 @@ defmodule RealtimeWeb.StatusLive.Index do alias Realtime.Latency.Payload alias Realtime.Nodes + alias RealtimeWeb.Endpoint @impl true def mount(_params, _session, socket) do - if connected?(socket), do: RealtimeWeb.Endpoint.subscribe("admin:cluster") - {:ok, assign(socket, pings: default_pings(), nodes: Enum.count(all_nodes()))} + if connected?(socket), do: Endpoint.subscribe("admin:cluster") + + socket = + socket + |> assign(nodes: Enum.count(all_nodes())) + |> stream(:pings, default_pings()) + + {:ok, 
socket} end @impl true @@ -17,17 +24,14 @@ defmodule RealtimeWeb.StatusLive.Index do @impl true def handle_info(%Phoenix.Socket.Broadcast{payload: %Payload{} = payload}, socket) do - pair = payload.from_node <> "_" <> payload.node - payload = %{pair => payload} - - pings = Map.merge(socket.assigns.pings, payload) + pair = pair_id(payload.from_node, payload.node) - {:noreply, assign(socket, pings: pings)} + {:noreply, stream(socket, :pings, [%{id: pair, payload: payload}])} end defp apply_action(socket, :index, _params) do socket - |> assign(:page_title, "Status - Supabase Realtime") + |> assign(:page_title, "Realtime Status") end defp all_nodes do @@ -35,9 +39,14 @@ defmodule RealtimeWeb.StatusLive.Index do end defp default_pings do - for n <- all_nodes(), f <- all_nodes(), into: %{} do - pair = n <> "_" <> f - {pair, %Payload{from_node: f, latency: "Loading...", node: n, timestamp: "Loading..."}} + for n <- all_nodes(), f <- all_nodes() do + pair = pair_id(f, n) + + %{id: pair, payload: %Payload{from_node: f, latency: "Loading...", node: n, timestamp: "Loading..."}} end end + + defp pair_id(from, to) do + from <> "_" <> to + end end diff --git a/lib/realtime_web/live/status_live/index.html.heex b/lib/realtime_web/live/status_live/index.html.heex index 645001714..63ea4fc0d 100644 --- a/lib/realtime_web/live/status_live/index.html.heex +++ b/lib/realtime_web/live/status_live/index.html.heex @@ -1,16 +1,16 @@ <.h1>Supabase Realtime: Multiplayer Edition + <.h2>Cluster Status +

Understand the latency between nodes across the Realtime cluster.

-<div>
-  <%= for {_pair, p} <- @pings do %>
-    <div>
-      <div>From: <%= p.from_region %> - <%= p.from_node %></div>
-      <div>To: <%= p.region %> - <%= p.node %></div>
-      <div><%= p.latency %> ms</div>
-      <div><%= p.timestamp %></div>
-    </div>
-  <% end %>
-</div>
+<div id="pings" phx-update="stream">
+  <div :for={{id, p} <- @streams.pings} id={id}>
+    <div>From: <%= p.payload.from_region %> - <%= p.payload.from_node %></div>
+    <div>To: <%= p.payload.region %> - <%= p.payload.node %></div>
+    <div><%= p.payload.latency %> ms</div>
+    <div><%= p.payload.timestamp %></div>
+  </div>
+</div>
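A note on the stream rewrite above: LiveView's stream/3 upserts by item id, replacing the DOM node whose id matches, which is why each node pair gets a stable pair_id. A minimal self-contained sketch of that upsert pattern, with illustrative module, topic, and markup names that are not from this patch:

defmodule MyAppWeb.PingsLive do
  use Phoenix.LiveView

  def mount(_params, _session, socket) do
    # Seed one placeholder row per node pair; the item id becomes part of the
    # generated DOM id for the row.
    {:ok, stream(socket, :pings, [%{id: "pink_orange", payload: %{latency: "Loading..."}}])}
  end

  def handle_info({:ping, pair, payload}, socket) do
    # Re-streaming an item with an existing id patches that row in place,
    # so the page keeps exactly one row per node pair instead of appending.
    {:noreply, stream(socket, :pings, [%{id: pair, payload: payload}])}
  end

  def render(assigns) do
    ~H"""
    <div id="pings" phx-update="stream">
      <div :for={{dom_id, p} <- @streams.pings} id={dom_id}>
        <%= p.payload.latency %>
      </div>
    </div>
    """
  end
end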
diff --git a/mix.exs b/mix.exs index d0e42bf11..e98ac608f 100644 --- a/mix.exs +++ b/mix.exs @@ -4,7 +4,7 @@ defmodule Realtime.MixProject do def project do [ app: :realtime, - version: "2.53.0", + version: "2.53.4", elixir: "~> 1.17.3", elixirc_paths: elixirc_paths(Mix.env()), start_permanent: Mix.env() == :prod, diff --git a/test/integration/rt_channel_test.exs b/test/integration/rt_channel_test.exs index 23b1a3a7f..8f4607b01 100644 --- a/test/integration/rt_channel_test.exs +++ b/test/integration/rt_channel_test.exs @@ -1159,7 +1159,7 @@ defmodule Realtime.Integration.RtChannelTest do } end) - assert log =~ "ChannelShutdown: Token has expired 1000 seconds ago" + assert log =~ "ChannelShutdown: Token has expired" end test "ChannelShutdown include sub if available in jwt claims", %{tenant: tenant, topic: topic} do @@ -2240,7 +2240,8 @@ defmodule Realtime.Integration.RtChannelTest do # 0 events as no broadcast used assert 2 = get_count([:realtime, :rate_counter, :channel, :joins], external_id) assert 2 = get_count([:realtime, :rate_counter, :channel, :presence_events], external_id) - assert 10 = get_count([:realtime, :rate_counter, :channel, :db_events], external_id) + # 5 + 5 + 5 (5 for each websocket and 5 while publishing) + assert 15 = get_count([:realtime, :rate_counter, :channel, :db_events], external_id) assert 0 = get_count([:realtime, :rate_counter, :channel, :events], external_id) end @@ -2587,7 +2588,7 @@ defmodule Realtime.Integration.RtChannelTest do Realtime.Tenants.Cache.invalidate_tenant_cache(external_id) end - defp assert_process_down(pid, timeout \\ 100) do + defp assert_process_down(pid, timeout \\ 300) do ref = Process.monitor(pid) assert_receive {:DOWN, ^ref, :process, ^pid, _reason}, timeout end diff --git a/test/realtime/extensions/cdc_rls/cdc_rls_test.exs b/test/realtime/extensions/cdc_rls/cdc_rls_test.exs index d12c0ba73..704346aab 100644 --- a/test/realtime/extensions/cdc_rls/cdc_rls_test.exs +++ b/test/realtime/extensions/cdc_rls/cdc_rls_test.exs @@ -235,6 +235,16 @@ defmodule Realtime.Extensions.CdcRlsTest do end) RateCounter.stop(tenant.external_id) + on_exit(fn -> RateCounter.stop(tenant.external_id) end) + + on_exit(fn -> :telemetry.detach(__MODULE__) end) + + :telemetry.attach( + __MODULE__, + [:realtime, :tenants, :payload, :size], + &__MODULE__.handle_telemetry/4, + pid: self() + ) on_exit(fn -> :telemetry.detach(__MODULE__) end) @@ -324,8 +334,11 @@ defmodule Realtime.Extensions.CdcRlsTest do rate = Realtime.Tenants.db_events_per_second_rate(tenant) - assert {:ok, %RateCounter{id: {:channel, :db_events, "dev_tenant"}, bucket: bucket}} = RateCounter.get(rate) - assert 1 in bucket + assert {:ok, %RateCounter{id: {:channel, :db_events, "dev_tenant"}, bucket: bucket}} = + RateCounter.get(rate) + + # 1 from ReplicationPoller and 1 from MessageDispatcher + assert Enum.sum(bucket) == 2 assert_receive { :telemetry, @@ -335,6 +348,79 @@ defmodule Realtime.Extensions.CdcRlsTest do } end + test "rate limit works", %{tenant: tenant, conn: conn} do + on_exit(fn -> PostgresCdcRls.handle_stop(tenant.external_id, 10_000) end) + + %Tenant{extensions: extensions, external_id: external_id} = tenant + postgres_extension = PostgresCdc.filter_settings("postgres_cdc_rls", extensions) + args = Map.put(postgres_extension, "id", external_id) + + pg_change_params = [ + %{ + id: UUID.uuid1(), + params: %{"event" => "*", "schema" => "public"}, + channel_pid: self(), + claims: %{ + "exp" => System.system_time(:second) + 100_000, + "iat" => 0, + "ref" => "127.0.0.1", + "role" => 
"anon" + } + } + ] + + ids = + Enum.map(pg_change_params, fn %{id: id, params: params} -> + {UUID.string_to_binary!(id), :erlang.phash2(params)} + end) + + topic = "realtime:test" + serializer = Phoenix.Socket.V1.JSONSerializer + + subscription_metadata = {:subscriber_fastlane, self(), serializer, ids, topic, external_id, true} + metadata = [metadata: subscription_metadata] + :ok = PostgresCdc.subscribe(PostgresCdcRls, pg_change_params, external_id, metadata) + + # First time it will return nil + PostgresCdcRls.handle_connect(args) + # Wait for it to start + Process.sleep(3000) + {:ok, response} = PostgresCdcRls.handle_connect(args) + + # Now subscribe to the Postgres Changes + {:ok, _} = PostgresCdcRls.handle_after_connect(response, postgres_extension, pg_change_params) + assert %Postgrex.Result{rows: [[1]]} = Postgrex.query!(conn, "select count(*) from realtime.subscription", []) + + log = + capture_log(fn -> + # increment artifically the counter to reach the limit + tenant.external_id + |> Realtime.Tenants.db_events_per_second_key() + |> Realtime.GenCounter.add(100_000_000) + + # Wait for RateCounter to update + Process.sleep(1500) + end) + + assert log =~ "MessagePerSecondRateLimitReached: Too many postgres changes messages per second" + + # Insert a record + %{rows: [[_id]]} = Postgrex.query!(conn, "insert into test (details) values ('test') returning id", []) + + refute_receive {:socket_push, :text, _}, 5000 + + # Wait for RateCounter to update + Process.sleep(2000) + + rate = Realtime.Tenants.db_events_per_second_rate(tenant) + + assert {:ok, %RateCounter{id: {:channel, :db_events, "dev_tenant"}, bucket: bucket, limit: %{triggered: true}}} = + RateCounter.get(rate) + + # Nothing has changed + assert Enum.sum(bucket) == 100_000_000 + end + @aux_mod (quote do defmodule Subscriber do # Start CDC remotely diff --git a/test/realtime/tenants/connect_test.exs b/test/realtime/tenants/connect_test.exs index 741f6ecf7..804b3018f 100644 --- a/test/realtime/tenants/connect_test.exs +++ b/test/realtime/tenants/connect_test.exs @@ -328,9 +328,9 @@ defmodule Realtime.Tenants.ConnectTest do region = Tenants.region(tenant) assert {_pid, %{conn: ^db_conn, region: ^region}} = :syn.lookup(Connect, external_id) Process.sleep(1000) - :syn.leave(:users, external_id, self()) + external_id |> UsersCounter.scope() |> :syn.leave(external_id, self()) Process.sleep(1000) - assert :undefined = :syn.lookup(Connect, external_id) + assert :undefined = external_id |> UsersCounter.scope() |> :syn.lookup(external_id) refute Process.alive?(db_conn) Connect.shutdown(external_id) end diff --git a/test/realtime_web/channels/payloads/join_test.exs b/test/realtime_web/channels/payloads/join_test.exs index c1ea54a67..f02c2a73d 100644 --- a/test/realtime_web/channels/payloads/join_test.exs +++ b/test/realtime_web/channels/payloads/join_test.exs @@ -58,6 +58,14 @@ defmodule RealtimeWeb.Channels.Payloads.JoinTest do assert is_binary(key) end + test "presence key can be number" do + config = %{"config" => %{"presence" => %{"enabled" => true, "key" => 123}}} + + assert {:ok, %Join{config: %Config{presence: %Presence{key: key}}}} = Join.validate(config) + + assert key == 123 + end + test "invalid replay" do config = %{"config" => %{"broadcast" => %{"replay" => 123}}} @@ -105,5 +113,11 @@ defmodule RealtimeWeb.Channels.Payloads.JoinTest do user_token: ["unable to parse, expected string"] } end + + test "handles postgres changes with nil value in array as empty array" do + config = %{"config" => %{"postgres_changes" => [nil]}} + + 
+      assert {:ok, %Join{config: %Config{postgres_changes: []}}} = Join.validate(config)
+    end
   end
 end

diff --git a/test/realtime_web/live/status_live/index_test.exs b/test/realtime_web/live/status_live/index_test.exs
new file mode 100644
index 000000000..ae3af0ad0
--- /dev/null
+++ b/test/realtime_web/live/status_live/index_test.exs
@@ -0,0 +1,33 @@
+defmodule RealtimeWeb.StatusLive.IndexTest do
+  use RealtimeWeb.ConnCase
+  import Phoenix.LiveViewTest
+
+  alias Realtime.Latency.Payload
+  alias Realtime.Nodes
+  alias RealtimeWeb.Endpoint
+
+  describe "Status LiveView" do
+    test "renders status page", %{conn: conn} do
+      {:ok, _view, html} = live(conn, ~p"/status")
+
+      assert html =~ "Realtime Status"
+    end
+
+    test "receives broadcast from PubSub", %{conn: conn} do
+      {:ok, view, _html} = live(conn, ~p"/status")
+
+      payload = %Payload{
+        from_node: Nodes.short_node_id_from_name(:"pink@127.0.0.1"),
+        node: Nodes.short_node_id_from_name(:"orange@127.0.0.1"),
+        latency: "42ms",
+        timestamp: DateTime.utc_now()
+      }
+
+      Endpoint.broadcast("admin:cluster", "ping", payload)
+
+      html = render(view)
+      assert html =~ "42ms"
+      assert html =~ "pink@127.0.0.1_orange@127.0.0.1"
+    end
+  end
+end
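To close with an illustration of the syn scope sharding from #1566 (see lib/realtime/user_counter.ex above): :erlang.phash2/2 gives each tenant a stable shard, so per-tenant joins and counts only touch one of the users_N scopes. A small sketch, assuming users_scope_shards is 5:

# Mirrors UsersCounter.scope/1 above with users_scope_shards = 5.
shards = 5
scope = fn tenant_id -> :"users_#{:erlang.phash2(tenant_id, shards)}" end

scope.("dev_tenant")   # always the same shard for this tenant id, e.g. :users_2
scope.("other_tenant") # tenants spread across :users_0 .. :users_4

# Listing connected tenants must union all shards, which is what the patched
# Tenants.list_connected_tenants/1 does:
# Enum.flat_map(for(s <- 0..(shards - 1), do: :"users_#{s}"), &:syn.group_names(&1, node()))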