Skip to content

Commit de90383

Browse files
authored
Merge pull request #17468 from MinaProtocol/lyh/fix-lmdb-deadlock
Fix LMDB deadlock by not accessing LMDB in GC interrupt
2 parents ca2e5bd + 8a787ef commit de90383

File tree

5 files changed

+58
-22
lines changed

5 files changed

+58
-22
lines changed

changes/17468.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fixed a bug that's blocking release 3.1.2: In GC interrupt, hooks into LMDB causes double lock requiring.

src/lib/disk_cache/lmdb/disk_cache.ml

Lines changed: 41 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,29 +16,63 @@ module Make (Data : Binable.S) = struct
1616

1717
module Rw = Read_write (F)
1818

19-
type t = { env : Rw.t; db : Rw.holder; counter : int ref }
19+
type t =
20+
{ env : Rw.t
21+
; db : Rw.holder
22+
; counter : int ref
23+
; logger : Logger.t
24+
; garbage : int Hash_set.t
25+
(** A list of ids that are no longer reachable from OCaml's side *)
26+
}
27+
28+
(** How big can the above hashset be before we do a cleanup *)
29+
let garbage_size_limit = 512
2030

2131
let initialize path ~logger =
2232
Async.Deferred.Result.map (Disk_cache_utils.initialize_dir path ~logger)
2333
~f:(fun path ->
2434
let env, db = Rw.create path in
25-
{ env; db; counter = ref 0 } )
35+
{ env
36+
; db
37+
; counter = ref 0
38+
; logger
39+
; garbage = Hash_set.create (module Int)
40+
} )
2641

2742
type id = { idx : int }
2843

29-
let get ({ env; db; _ } : t) ({ idx } : id) : Data.t =
44+
let get ({ env; db; logger; _ } : t) ({ idx } : id) : Data.t =
45+
[%log debug] "Getting data at %d in LMDB cache" idx
46+
~metadata:[ ("index", `Int idx) ] ;
3047
Rw.get ~env db idx |> Option.value_exn
3148

32-
let put ({ env; db; counter } : t) (x : Data.t) : id =
49+
let put ({ env; db; counter; logger; garbage } : t) (x : Data.t) : id =
50+
(* TODO: we may reuse IDs by pulling them from the `garbage` hash set *)
3351
let idx = !counter in
3452
incr counter ;
3553
let res = { idx } in
3654
(* When this reference is GC'd, delete the file. *)
37-
Gc.Expert.add_finalizer_last_exn res (fun () -> Rw.remove ~env db idx) ;
55+
Gc.Expert.add_finalizer_last_exn res (fun () ->
56+
(* The actual deletion is delayed, as GC maybe triggered in LMDB's
57+
critical section. LMDB critical section then will be re-entered if
58+
it's invoked directly in a GC hook.
59+
This causes mutex double-acquiring and node freezes. *)
60+
[%log spam] "Data at %d is GCed, marking as garbage" idx
61+
~metadata:[ ("index", `Int idx) ] ;
62+
Hash_set.add garbage idx ) ;
63+
if Hash_set.length garbage >= garbage_size_limit then (
64+
Hash_set.iter garbage ~f:(fun to_remove ->
65+
[%log spam] "Instructing LMDB to remove garbage at index %d" to_remove
66+
~metadata:[ ("index", `Int to_remove) ] ;
67+
Rw.remove ~env db to_remove ) ;
68+
Hash_set.clear garbage ) ;
3869
Rw.set ~env db idx x ;
3970
res
4071

41-
let iteri ({ env; db; _ } : t) ~f = Rw.iter ~env db ~f
72+
let iteri ({ env; db; logger; _ } : t) ~f =
73+
Rw.iter ~env db ~f:(fun k v ->
74+
[%log spam] "Iterating at index %d" k ~metadata:[ ("index", `Int k) ] ;
75+
f k v )
4276

4377
let count ({ env; db; _ } : t) =
4478
let sum = ref 0 in
@@ -52,7 +86,7 @@ let%test_module "disk_cache lmdb" =
5286
( module struct
5387
include Disk_cache_test_lib.Make_extended (Make)
5488

55-
let%test_unit "remove data on gc" = remove_data_on_gc ()
89+
let%test_unit "remove data on gc" = remove_data_on_gc ~gc_strict:false ()
5690

5791
let%test_unit "simple read/write (with iteration)" =
5892
simple_write_with_iteration ()

src/lib/disk_cache/lmdb/dune

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
disk_cache.utils
1212
disk_cache.test_lib)
1313
(preprocess
14-
(pps ppx_version ppx_jane))
14+
(pps ppx_version ppx_jane ppx_mina))
1515
(inline_tests
1616
(flags -verbose -show-counts))
1717
(instrumentation

src/lib/disk_cache/test/test_lmdb_deadlock.ml

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,7 @@ let () =
99
(module Disk_cache)
1010
in
1111
match res with
12-
| `Timeout ->
13-
printf
14-
"It is expected that LMDB cache times out for now. This should be \
15-
fixed." ;
16-
Deferred.unit
1712
| `Success ->
18-
failwith "The process should time out" )
13+
printf "Success" ; Deferred.unit
14+
| `Timeout ->
15+
failwith "The process should not time out" )

src/lib/disk_cache/test_lib/disk_cache_test_lib.ml

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,11 @@ module type S = sig
2323

2424
val initialization_special_cases : unit -> unit
2525

26-
val remove_data_on_gc : unit -> unit
26+
(** [remove_data_on_gc ?gc_strict ())] test behavior of cache on GC.
27+
If [gc_strict] is set to [false], then we won't check if the cache is
28+
empty after GC.
29+
*)
30+
val remove_data_on_gc : ?gc_strict:bool -> unit -> unit
2731
end
2832

2933
module type S_extended = sig
@@ -77,7 +81,7 @@ module Make_impl (Cache : Disk_cache_intf.S_with_count with module Data := Mock)
7781
File_system.with_temp_dir "disk_cache"
7882
~f:(simple_write_impl ?additional_checks)
7983

80-
let remove_data_on_gc_impl tmp_dir =
84+
let remove_data_on_gc_impl ~gc_strict tmp_dir =
8185
let%map cache = initialize_cache_or_fail tmp_dir ~logger in
8286

8387
let proof = Mock.{ proof = "dummy" } in
@@ -91,16 +95,16 @@ module Make_impl (Cache : Disk_cache_intf.S_with_count with module Data := Mock)
9195
[%test_eq: string] proof.proof proof_from_cache.proof
9296
~message:"invalid proof from cache" ) ;
9397

94-
Gc.compact () ;
98+
if gc_strict then (
99+
Gc.compact () ;
100+
[%test_eq: int] (Cache.count cache) 0
101+
~message:"cache should be empty after garbage collector run" )
95102

96-
[%test_eq: int] (Cache.count cache) 0
97-
~message:"cache should be empty after garbage collector run"
98-
99-
let remove_data_on_gc () =
103+
let remove_data_on_gc ?(gc_strict = true) () =
100104
Async.Thread_safe.block_on_async_exn
101105
@@ fun () ->
102106
File_system.with_temp_dir "disk_cache-remove_data_on_gc"
103-
~f:remove_data_on_gc_impl
107+
~f:(remove_data_on_gc_impl ~gc_strict)
104108

105109
let initialize_and_expect_failure path ~logger =
106110
let%bind cache_res = Cache.initialize path ~logger in

0 commit comments

Comments
 (0)