Skip to content

hung state due to race in socket_callback if socket closes just before watch #253

@tanmaykm

Description

@tanmaykm

Discovered this while investigating instances of our application getting into occasional frozen states.

It seems that if the connection closes just when the socket_callback method starts and creates a FDWatcher, it throws an exception which results in the returning an error. That results in the multi instance getting into a bad state. The Downloads.request method hangs. Additionally any other task also invoking Downloads.request (and using the same multi instance) gets into a hung state.

Below is a MWE that simulates the scenario.

   _       _ _(_)_     |  Documentation: https://docs.julialang.org
  (_)     | (_) (_)    |
   _ _   _| |_  __ _   |  Type "?" for help, "]?" for Pkg help.
  | | | | | | |/ _` |  |
  | | |_| | | | (_| |  |  Version 1.10.4 (2024-06-04)
 _/ |\__'_|_|_|\__'_|  |  Official https://julialang.org/ release
|__/                   |


julia> using FileWatching

julia> import Base: OS_HANDLE, preserve_handle, unpreserve_handle

julia> using Downloads

julia> import Downloads.Curl: curl_socket_t, connect_semaphore_acquire, init!,
           Multi,
           Easy,
           @check,
           CURL_POLL_IN, CURL_POLL_OUT, CURL_POLL_INOUT, CURL_POLL_REMOVE,
           CURL_CSELECT_IN, CURL_CSELECT_OUT, CURL_CSELECT_ERR,
           check_multi_info, curl_multi_assign, curl_multi_socket_action, curl_multi_add_handle

julia> _close = Ref{Bool}(true)
Base.RefValue{Bool}(true)

julia> function simulate_closed_socket(sock)
           if _close[]
               ccall(:close, Cint, (Cint,), sock)
           end
       end
simulate_closed_socket (generic function with 1 method)

julia> function Downloads.Curl.socket_callback(
           easy_h    :: Ptr{Cvoid},
           sock      :: curl_socket_t,
           action    :: Cint,
           multi_p   :: Ptr{Cvoid},
           watcher_p :: Ptr{Cvoid},
       )::Cint
           try
               if action ∉ (CURL_POLL_IN, CURL_POLL_OUT, CURL_POLL_INOUT, CURL_POLL_REMOVE)
                   @async @error("socket_callback: unexpected action", action, maxlog=1_000)
                   return -1
               end
               multi = unsafe_pointer_to_objref(multi_p)::Multi
               if watcher_p != C_NULL
                   old_watcher = unsafe_pointer_to_objref(watcher_p)::FDWatcher
                   @check curl_multi_assign(multi.handle, sock, C_NULL)
                   unpreserve_handle(old_watcher)
               end
               if action in (CURL_POLL_IN, CURL_POLL_OUT, CURL_POLL_INOUT)
                   readable = action in (CURL_POLL_IN,  CURL_POLL_INOUT)
                   writable = action in (CURL_POLL_OUT, CURL_POLL_INOUT)
                   os_handle = OS_HANDLE(sock)
                   simulate_closed_socket(sock)
                   watcher = FDWatcher(os_handle, readable, writable)
                   preserve_handle(watcher)
                   watcher_p = pointer_from_objref(watcher)
                   @check curl_multi_assign(multi.handle, sock, watcher_p)
                   task = @async while watcher.readable || watcher.writable # isopen(watcher)
                       events = try
                           wait(watcher)
                       catch err
                           err isa EOFError && return
                           err isa Base.IOError || rethrow()
                           FileWatching.FDEvent()
                       end
                       flags = CURL_CSELECT_IN  * isreadable(events) +
                               CURL_CSELECT_OUT * iswritable(events) +
                               CURL_CSELECT_ERR * (events.disconnect || events.timedout)
                       lock(multi.lock) do
                           watcher.readable || watcher.writable || return # !isopen
                           @check curl_multi_socket_action(multi.handle, sock, flags)
                           check_multi_info(multi)
                       end
                   end
                   @isdefined(errormonitor) && errormonitor(task)
               end
               @isdefined(old_watcher) && close(old_watcher)
               return 0
           catch err
               @async @error("socket_callback: unexpected error", err=err, maxlog=1_000)
               return -1
           end
       end

julia> _close[] = false                     # normal condition
false

julia> t0 = @async Downloads.request("google.com")     # this should go thru normally
Task (runnable) @0x00007f71631cc4c0

julia> wait(t0)

julia> @info("t0 done: $(istaskdone(t0))")
[ Info: t0 done: true

julia> _close[] = true                     # simulate a closed connection
true

julia> t1 = @async Downloads.request("google.com")     # this will cause the task to hang
┌ Error: socket_callback: unexpected error
│   err = IOError: FDWatcher: bad file descriptor (EBADF)
└ @ Main REPL[7]:50
┌ Error: curl_multi_socket_action: 11
└ @ Downloads.Curl /data/Work/julia/binaries/julia-1.10.4/share/julia/stdlib/v1.10/Downloads/src/Curl/utils.jl:57
Task (runnable) @0x00007f70f73de270

julia> sleep(10)

julia> @info("t1 done: $(istaskdone(t1))")
[ Info: t1 done: false

julia> @info("while t1 is hung, any new requests will also get hung")
[ Info: while t1 is hung, any new requests will also get hung

julia> t2 = @async Downloads.request("google.com")     # these will also hang
┌ Error: curl_multi_add_handle: 11
└ @ Downloads.Curl /data/Work/julia/binaries/julia-1.10.4/share/julia/stdlib/v1.10/Downloads/src/Curl/utils.jl:57
Task (runnable) @0x00007f70f6f9f080

julia> t3 = @async Downloads.request("google.com")     # these will also hang
┌ Error: curl_multi_add_handle: 11
└ @ Downloads.Curl /data/Work/julia/binaries/julia-1.10.4/share/julia/stdlib/v1.10/Downloads/src/Curl/utils.jl:57
Task (runnable) @0x00007f70f6f9c330

julia> sleep(10)

julia> @info("t2 done: $(istaskdone(t2))")
[ Info: t2 done: false

julia> @info("t3 done: $(istaskdone(t3))")
[ Info: t3 done: false

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions