fix: Handle in-flight requests during stack shutdown more gracefully (#3310)

msfstef · web-flow · commit 14ce221ee332 · 2025-10-21T10:58:34.000+03:00
Closes #3300

@balegas this error was a symptom of what we have discussed about races during shutdown: requests that are in flight during the shutdown start erroring because the resources they depend on are no longer present.

I have looked into the logs of the occurrences, pinpointed two causes, and added handlers for them:

- After a live request timeout, we check the global last processed LSN to attach to an up-to-date message - but if the stack goes down during that 20-second wait (and is still down by the end of the 20 seconds), it fails either with the table not being present or with no value for the LSN.
  - I had originally fixed this by doing another `hold_until_stack_ready` after a live timeout, but @alco had removed that in his PR that moved the shape status out of the connection subsystem - however, this is still an issue in how we do multitenancy/cloud, so I'm putting it back in without a timeout (it no longer wakes connections either).
  - Added a test to cover it, as the test that was there was not actually doing the right thing.
- An in-flight request might reach `await_snapshot_start`, which, if it finds a shape but no shape process, enters a retry loop because it expects the process to exist shortly - which is not the case if the stack is shutting down, so it eventually errors with tables not found.
  - I am now catching argument errors (tables not existing) and returning a "prettier" error that will be turned into a 500 but without a Sentry error - ultimately, what to do with in-flight requests is an issue we still have to deal with.
diff --git a/.changeset/metal-peaches-change.md b/.changeset/metal-peaches-change.md
@@ -0,0 +1,5 @@
+---
+'@core/sync-service': patch
+---
+
+Handle requests during stack shutdown more gracefully
diff --git a/packages/sync-service/lib/electric/lsn_tracker.ex b/packages/sync-service/lib/electric/lsn_tracker.ex
@@ -19,6 +19,8 @@ defmodule Electric.LsnTracker do
     stack_id
     |> table()
     |> :ets.insert({:last_processed_lsn, lsn})
+
+    :ok
   end
 
   def set_last_processed_lsn(lsn, stack_id) when is_integer(lsn) do
diff --git a/packages/sync-service/lib/electric/shape_cache.ex b/packages/sync-service/lib/electric/shape_cache.ex
@@ -161,10 +161,11 @@ defmodule Electric.ShapeCache do
         {:error, :unknown}
 
       true ->
-        server = Electric.Shapes.Consumer.name(stack_id, shape_handle)
-
         try do
-          GenServer.call(server, :await_snapshot_start, 15_000)
+          Electric.Shapes.Consumer.await_snapshot_start(
+            %{stack_id: stack_id, shape_handle: shape_handle},
+            15_000
+          )
         catch
           :exit, {:timeout, {GenServer, :call, _}} ->
             # Please note that :await_snapshot_start can also return a timeout error as well
@@ -181,6 +182,9 @@ defmodule Electric.ShapeCache do
             await_snapshot_start(shape_handle, opts)
         end
     end
+  rescue
+    ArgumentError ->
+      {:error, %RuntimeError{message: "Shape meta tables not found"}}
   end
 
   @impl Electric.ShapeCacheBehaviour
diff --git a/packages/sync-service/lib/electric/shape_cache/shape_status.ex b/packages/sync-service/lib/electric/shape_cache/shape_status.ex
@@ -20,7 +20,8 @@ defmodule Electric.ShapeCache.ShapeStatusBehaviour do
     @type stack_ref() :: atom() | stack_id() | [stack_id: stack_id()]
   end
 
-  @callback initialize_from_storage(stack_ref(), Storage.t()) :: :ok | {:error, term()}
+  @callback initialize_from_storage(stack_ref(), Electric.ShapeCache.Storage.storage()) ::
+              :ok | {:error, term()}
   @callback terminate(stack_ref(), String.t()) :: :ok | {:error, term()}
   @callback list_shapes(stack_ref()) :: [{shape_handle(), Shape.t()}]
   @callback count_shapes(stack_ref()) :: non_neg_integer()
@@ -521,10 +522,7 @@ defmodule Electric.ShapeCache.ShapeStatus do
   end
 
   defp backup_file_path(backup_dir) do
-    case backup_dir do
-      nil -> nil
-      dir -> dir |> Path.join(@backup_file) |> String.to_charlist()
-    end
+    backup_dir |> Path.join(@backup_file) |> String.to_charlist()
   end
 
   def backup_dir(storage) do
diff --git a/packages/sync-service/lib/electric/shapes/api.ex b/packages/sync-service/lib/electric/shapes/api.ex
@@ -366,7 +366,7 @@ defmodule Electric.Shapes.Api do
 
   defp hold_until_stack_ready(%Api{} = api, opts \\ []) do
     stack_id = stack_id(api)
-    opts = [timeout: api.stack_ready_timeout] ++ opts
+    opts = Keyword.put_new(opts, :timeout, api.stack_ready_timeout)
 
     case Electric.StatusMonitor.wait_until_active(stack_id, opts) do
       :ok ->
@@ -384,7 +384,7 @@ defmodule Electric.Shapes.Api do
         hold_until_stack_ready(api, block_on_conn_sleeping: true)
 
       {:error, message} ->
-        Logger.warning("Stack not ready after #{api.stack_ready_timeout}ms. Reason: #{message}")
+        Logger.warning("Stack not ready after #{opts[:timeout]}ms. Reason: #{message}")
         {:error, Response.error(api, message, status: 503)}
     end
   end
@@ -746,7 +746,7 @@ defmodule Electric.Shapes.Api do
     %{
       new_changes_ref: ref,
       handle: shape_handle,
-      api: %{long_poll_timeout: long_poll_timeout}
+      api: %{long_poll_timeout: long_poll_timeout} = api
     } = request
 
     Logger.debug("Client #{inspect(self())} is waiting for changes to #{shape_handle}")
@@ -772,12 +772,21 @@ defmodule Electric.Shapes.Api do
         error = Api.Error.must_refetch()
         Response.error(request, error.message, status: error.status)
     after
-      # If we timeout, return an up-to-date message
+      # If we time out, check that the stack is still up before
+      # returning an up-to-date message
       long_poll_timeout ->
-        request
-        |> update_attrs(%{ot_is_long_poll_timeout: true})
-        |> determine_global_last_seen_lsn()
-        |> no_change_response()
+        request = update_attrs(request, %{ot_is_long_poll_timeout: true})
+
+        case Electric.StatusMonitor.status(api.stack_id) do
+          %{shape: :up} ->
+            request
+            |> determine_global_last_seen_lsn()
+            |> no_change_response()
+
+          _ ->
+            message = Electric.StatusMonitor.timeout_message(api.stack_id)
+            Response.error(request, message, status: 503)
+        end
     end
   end
 
@@ -821,7 +830,7 @@ defmodule Electric.Shapes.Api do
 
     response = %{request.response | chunked: true, body: sse_event_stream}
 
-    %{response | trace_attrs: Map.put(response.trace_attrs || %{}, :ot_is_sse_response, true)}
+    %{response | trace_attrs: Map.put(response.trace_attrs, :ot_is_sse_response, true)}
   end
 
   defp next_sse_event(%SseState{mode: :receive} = state) do
diff --git a/packages/sync-service/lib/electric/shapes/consumer.ex b/packages/sync-service/lib/electric/shapes/consumer.ex
@@ -34,12 +34,16 @@ defmodule Electric.Shapes.Consumer do
     GenServer.call(consumer, :initial_state, 30_000)
   end
 
-  def await_snapshot_start(consumer) when is_pid(consumer) do
-    GenServer.call(consumer, :await_snapshot_start, 30_000)
+  @spec await_snapshot_start(pid() | map()) :: :started | {:error, any()}
+  @spec await_snapshot_start(pid() | map(), timeout()) :: :started | {:error, any()}
+  def await_snapshot_start(consumer, timeout \\ 30_000)
+
+  def await_snapshot_start(consumer, timeout) when is_pid(consumer) do
+    GenServer.call(consumer, :await_snapshot_start, timeout)
   end
 
-  def await_snapshot_start(consumer) do
-    GenServer.call(name(consumer), :await_snapshot_start, 30_000)
+  def await_snapshot_start(consumer, timeout) do
+    GenServer.call(name(consumer), :await_snapshot_start, timeout)
   end
 
   def subscribe_materializer(consumer) do
diff --git a/packages/sync-service/test/electric/shape_cache_test.exs b/packages/sync-service/test/electric/shape_cache_test.exs
@@ -822,6 +822,76 @@ defmodule Electric.ShapeCacheTest do
                "Shape terminated before snapshot was ready"
              ]
     end
+
+    test "should wait for consumer to come up", ctx do
+      Support.TestUtils.patch_snapshotter(fn parent, shape_handle, _, _ ->
+        GenServer.cast(parent, {:pg_snapshot_known, shape_handle, @pg_snapshot_xmin_100})
+        GenServer.cast(parent, {:snapshot_started, shape_handle})
+      end)
+
+      %{shape_cache_opts: opts} = with_shape_cache(ctx)
+
+      start_consumer_delay = 500
+
+      test_pid = self()
+
+      Repatch.patch(
+        Electric.Shapes.DynamicConsumerSupervisor,
+        :start_shape_consumer,
+        [mode: :shared],
+        fn a, b ->
+          send(test_pid, :about_to_start_consumer)
+
+          Process.sleep(start_consumer_delay)
+          Repatch.real(Electric.Shapes.DynamicConsumerSupervisor.start_shape_consumer(a, b))
+        end
+      )
+
+      Repatch.allow(self(), opts[:server])
+
+      creation_task = Task.async(fn -> ShapeCache.get_or_create_shape_handle(@shape, opts) end)
+
+      {shape_handle, _} =
+        receive do
+          :about_to_start_consumer -> ShapeCache.get_or_create_shape_handle(@shape, opts)
+        end
+
+      wait_task = Task.async(fn -> ShapeCache.await_snapshot_start(shape_handle, opts) end)
+
+      # should delay in responding
+      refute Task.yield(wait_task, 10)
+      Task.await(creation_task)
+      assert :started = Task.await(wait_task, start_consumer_delay)
+    end
+
+    test "should stop waiting for consumer to come up if shape tables missing", ctx do
+      Support.TestUtils.patch_snapshotter(fn _, _, _, _ -> nil end)
+      %{shape_cache_opts: opts} = with_shape_cache(ctx)
+
+      Repatch.patch(
+        Electric.Shapes.DynamicConsumerSupervisor,
+        :start_shape_consumer,
+        [mode: :shared],
+        fn _, _ -> Process.sleep(:infinity) end
+      )
+
+      Repatch.allow(self(), opts[:server])
+
+      start_supervised({Task, fn -> ShapeCache.get_or_create_shape_handle(@shape, opts) end})
+
+      Process.sleep(10)
+
+      {shape_handle, _} = ShapeCache.get_or_create_shape_handle(@shape, opts)
+
+      wait_task = Task.async(fn -> ShapeCache.await_snapshot_start(shape_handle, opts) end)
+
+      # should delay in responding
+      refute Task.yield(wait_task, 10)
+      stop_supervised(ctx[:shape_status_owner])
+
+      assert {:error, %RuntimeError{message: "Shape meta tables not found"}} =
+               Task.await(wait_task, 500)
+    end
   end
 
   describe "after restart" do
diff --git a/packages/sync-service/test/electric/shapes/api_test.exs b/packages/sync-service/test/electric/shapes/api_test.exs
@@ -1,5 +1,6 @@
 defmodule Electric.Shapes.ApiTest do
   use ExUnit.Case, async: true
+  use Repatch.ExUnit
   use Support.Mock
 
   alias Electric.Postgres.Lsn
@@ -1194,29 +1195,38 @@ defmodule Electric.Shapes.ApiTest do
         []
       end)
 
-      lsn = Lsn.from_integer(:rand.uniform(1_000_000))
-      Electric.LsnTracker.set_last_processed_lsn(lsn, ctx.stack_id)
+      stack_id = ctx.stack_id
 
-      assert {:ok, request} =
-               Api.validate(
-                 ctx.api,
-                 %{
-                   table: "public.users",
-                   offset: "#{@test_offset}",
-                   handle: @test_shape_handle,
-                   live: true
-                 }
-               )
+      status_task =
+        start_supervised!({
+          Task,
+          fn ->
+            set_status_to_active(ctx)
+            Process.sleep(:infinity)
+          end
+        })
 
-      response = Api.serve_shape_response(request)
-      assert response.status == 200
+      req_task =
+        Task.async(fn ->
+          assert {:ok, request} =
+                   Api.validate(
+                     ctx.api,
+                     %{
+                       table: "public.users",
+                       offset: "#{@test_offset}",
+                       handle: @test_shape_handle,
+                       live: true
+                     }
+                   )
 
-      assert [
-               %{
-                 headers: %{control: "up-to-date", global_last_seen_lsn: "#{Lsn.to_integer(lsn)}"}
-               }
-             ] ==
-               response_body(response)
+          Process.exit(status_task, :kill)
+          Electric.StatusMonitor.wait_for_messages_to_be_processed(stack_id)
+          Process.sleep(50)
+
+          Api.serve_shape_response(request)
+        end)
+
+      assert %{status: 503} = Task.await(req_task)
     end
   end
 

-Original file line number
+Diff line change
@@ @@ -0,0 +1,5 @@ @@
 +---
 +'@core/sync-service': patch
 +---
++
 +Handle requests during stack shutdown more gracefully