Skip to content

Commit 28dce05

Browse files
authored
storage_access: robustify SM detection on startup to avoid duplicates (xapi-project#6873)
Some XCP-ng customers have observed xapi recording duplicate XOStore SM backends. On these cases the `host_pending_feature` map present in both SM objects by the xapi objects were different: one was empty, and the other one had an empty entry for each of the hosts in the pool. This PR tries to address this by detecting the latter case and transforming into the former. It also has changes to the startup process where the processing of the SM objects could miss updating some of them, and now makes a point of handling SM objects and SM types differently to avoid missing as many cases as possible. Since I've been unable to reproduce the issue of duplicates, there's also code added to detect and log duplicates after the processing at the startup has ended to be able to better identify when the situation happens. I've tested the changes manually on a host to make sure there are no strange issues with the SMs after the updated code has been applied. I'll open a PR against master once this one has been reviewed
2 parents 0574392 + ac216e9 commit 28dce05

File tree

5 files changed

+221
-132
lines changed

5 files changed

+221
-132
lines changed

ocaml/libs/xapi-stdext/lib/xapi-stdext-std/listext.mli

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,8 @@ module List : sig
154154
val set_equiv : 'a list -> 'a list -> bool
155155

156156
val set_difference : 'a list -> 'a list -> 'a list
157-
(** Returns the set difference of two lists *)
157+
(** [set_difference a b] returns the list with the elements in [a] that are
158+
not present in [b] *)
158159

159160
val intersect : 'a list -> 'a list -> 'a list
160161
(** Returns the intersection of two lists. *)

ocaml/xapi/storage_access.ml

Lines changed: 149 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@ let finally = Xapi_stdext_pervasives.Pervasiveext.finally
2121

2222
module XenAPI = Client.Client
2323
open Storage_interface
24-
open Storage_utils
24+
25+
let transform_storage_exn = Storage_utils.transform_storage_exn
2526

2627
module D = Debug.Make (struct let name = "storage_access" end)
2728

@@ -103,12 +104,72 @@ let external_rpc queue_name uri =
103104
(* Internal exception, never escapes the module *)
104105
exception Message_switch_failure
105106

106-
(* We have to be careful in this function, because an exception raised from
107-
here will cause the startup sequence to fail *)
107+
(* We have to be careful in the call tree of on_xapi_start, because an
108+
exception raised in it will cause the startup sequence to fail *)
109+
110+
let get_smapiv2_drivers_from_switch () =
111+
let module Client = Message_switch_unix.Protocol_unix.Client in
112+
try
113+
let ( >>| ) result f =
114+
match Client.error_to_msg result with
115+
| Error (`Msg x) ->
116+
error "%s: Error %s while querying message switch queues" __FUNCTION__
117+
x ;
118+
raise Message_switch_failure
119+
| Ok x ->
120+
f x
121+
in
122+
Client.connect ~switch:!Xcp_client.switch_path () >>| fun t ->
123+
Client.list ~t ~prefix:!Storage_interface.queue_name ~filter:`Alive ()
124+
>>| fun running_smapiv2_driver_queues ->
125+
running_smapiv2_driver_queues
126+
(* The results include the prefix itself, but that is the main storage
127+
queue, we don't need it *)
128+
|> List.filter (( <> ) !Storage_interface.queue_name)
129+
|> Listext.List.try_map (fun driver ->
130+
(* Get the last component of the queue name:
131+
org.xen.xapi.storage.sr_type -> sr_type *)
132+
driver
133+
|> String.split_on_char '.'
134+
|> Listext.List.last
135+
|> Option.to_result ~none:(Invalid_argument driver)
136+
)
137+
|> function
138+
| Ok drivers ->
139+
drivers
140+
| Error exn ->
141+
raise exn
142+
with
143+
| Message_switch_failure ->
144+
[] (* no more logging *)
145+
| e ->
146+
Backtrace.is_important e ;
147+
error "Unexpected error querying the message switch: %s"
148+
(Printexc.to_string e) ;
149+
Debug.log_backtrace e (Backtrace.get e) ;
150+
[]
151+
152+
let log_and_unregister ~__context ~reason __FUN (self, rc) =
153+
info "%s: unregistering SM plugin %s (%s) since %s" __FUN rc.API.sM_name_label
154+
rc.API.sM_uuid reason ;
155+
try Db.SM.destroy ~__context ~self with _ -> ()
156+
157+
module StringSet = Set.Make (String)
158+
159+
let list_assoc_all a =
160+
List.filter_map (fun (k, v) ->
161+
if String.equal k a then
162+
Some v
163+
else
164+
None
165+
)
166+
167+
let ( let@ ) f x = f x
108168

109169
(** Synchronise the SM table with the SMAPIv1 plugins on the disk and the SMAPIv2
110170
plugins mentioned in the configuration file whitelist. *)
111171
let on_xapi_start ~__context =
172+
let __FUN = __FUNCTION__ in
112173
(* An SM is either implemented as a plugin - for which we check its
113174
presence, or via an API *)
114175
let is_available rc =
@@ -120,107 +181,69 @@ let on_xapi_start ~__context =
120181
|> List.map (fun (rf, rc) -> (rc.API.sM_type, (rf, rc)))
121182
in
122183
let explicitly_configured_drivers =
123-
List.filter_map
124-
(function `Sm x -> Some x | _ -> None)
125-
!Xapi_globs.sm_plugins
184+
!Xapi_globs.sm_plugins
185+
|> List.filter_map (function `Sm x -> Some x | _ -> None)
186+
|> StringSet.of_list
187+
in
188+
let smapiv1_drivers = Sm.supported_drivers () |> StringSet.of_list in
189+
let configured_drivers =
190+
StringSet.union explicitly_configured_drivers smapiv1_drivers
126191
in
127-
let smapiv1_drivers = Sm.supported_drivers () in
128-
let configured_drivers = explicitly_configured_drivers @ smapiv1_drivers in
129192
let in_use_drivers =
130193
List.map (fun (_, rc) -> rc.API.sR_type) (Db.SR.get_all_records ~__context)
194+
|> StringSet.of_list
131195
in
132-
let to_keep = configured_drivers @ in_use_drivers in
133-
(* The SMAPIv2 drivers we know about *)
134-
let smapiv2_drivers = Listext.List.set_difference to_keep smapiv1_drivers in
196+
let to_keep = StringSet.union configured_drivers in_use_drivers in
135197
(* Query the message switch to detect running SMAPIv2 plugins. *)
136198
let running_smapiv2_drivers =
137-
if !Xcp_client.use_switch then (
138-
try
139-
let open Message_switch_unix.Protocol_unix in
140-
let ( >>| ) result f =
141-
match Client.error_to_msg result with
142-
| Error (`Msg x) ->
143-
error "Error %s while querying message switch queues" x ;
144-
raise Message_switch_failure
145-
| Ok x ->
146-
f x
147-
in
148-
Client.connect ~switch:!Xcp_client.switch_path () >>| fun t ->
149-
Client.list ~t ~prefix:!Storage_interface.queue_name ~filter:`Alive ()
150-
>>| fun running_smapiv2_driver_queues ->
151-
running_smapiv2_driver_queues
152-
(* The results include the prefix itself, but that is the main storage
153-
queue, we don't need it *)
154-
|> List.filter (( <> ) !Storage_interface.queue_name)
155-
|> Listext.List.try_map (fun driver ->
156-
(* Get the last component of the queue name:
157-
org.xen.xapi.storage.sr_type -> sr_type *)
158-
driver
159-
|> String.split_on_char '.'
160-
|> Listext.List.last
161-
|> Option.to_result ~none:(Invalid_argument driver)
162-
)
163-
|> function
164-
| Ok drivers ->
165-
drivers
166-
| Error exn ->
167-
raise exn
168-
with
169-
| Message_switch_failure ->
170-
[] (* no more logging *)
171-
| e ->
172-
Backtrace.is_important e ;
173-
error "Unexpected error querying the message switch: %s"
174-
(Printexc.to_string e) ;
175-
Debug.log_backtrace e (Backtrace.get e) ;
176-
[]
177-
) else
178-
smapiv2_drivers
199+
if !Xcp_client.use_switch then
200+
get_smapiv2_drivers_from_switch () |> StringSet.of_list
201+
else (* The SMAPIv2 drivers we know about *)
202+
StringSet.diff to_keep smapiv1_drivers
179203
in
180204
(* Add all the running SMAPIv2 drivers *)
181-
let to_keep = to_keep @ running_smapiv2_drivers in
205+
let to_keep = StringSet.union to_keep running_smapiv2_drivers in
206+
let existing_types = List.map fst existing |> StringSet.of_list in
207+
let unused = StringSet.diff existing_types to_keep in
182208
let unavailable =
183209
List.filter (fun (_, (_, rc)) -> not (is_available rc)) existing
184210
in
185211
(* Delete all records which aren't configured or in-use *)
186-
List.iter
187-
(fun ty ->
188-
info
189-
"Unregistering SM plugin %s since not in the whitelist and not in-use"
190-
ty ;
191-
let self, _ = List.assoc ty existing in
192-
try Db.SM.destroy ~__context ~self with _ -> ()
193-
)
194-
(Listext.List.set_difference (List.map fst existing) to_keep) ;
195-
List.iter
196-
(fun (name, (self, rc)) ->
197-
info "%s: unregistering SM plugin %s (%s) since it is unavailable"
198-
__FUNCTION__ name rc.API.sM_uuid ;
199-
try Db.SM.destroy ~__context ~self with _ -> ()
200-
)
201-
unavailable ;
212+
let unregister_unused ty =
213+
let sms = list_assoc_all ty existing in
214+
let reason = "it's not in the allowed list and not in-use" in
215+
List.iter (log_and_unregister ~__context ~reason __FUNCTION__) sms
216+
in
217+
let unregister_unavailable (_, sm) =
218+
let reason = "it's unavailable" in
219+
log_and_unregister ~__context ~reason __FUNCTION__ sm
220+
in
221+
StringSet.iter unregister_unused unused ;
222+
List.iter unregister_unavailable unavailable ;
202223

203224
(* Synchronize SMAPIv1 plugins *)
204225

205226
(* Create all missing SMAPIv1 plugins *)
206-
List.iter
227+
StringSet.iter
207228
(fun ty ->
208229
let query_result =
209230
Sm.info_of_driver ty |> Smint.query_result_of_sr_driver_info
210231
in
211232
Xapi_sm.create_from_query_result ~__context query_result
212233
)
213-
(Listext.List.set_difference smapiv1_drivers (List.map fst existing)) ;
234+
(StringSet.diff smapiv1_drivers existing_types) ;
214235
(* Update all existing SMAPIv1 plugins *)
215-
List.iter
236+
StringSet.iter
216237
(fun ty ->
217238
let query_result =
218239
Sm.info_of_driver ty |> Smint.query_result_of_sr_driver_info
219240
in
220-
Xapi_sm.update_from_query_result ~__context (List.assoc ty existing)
221-
query_result
241+
list_assoc_all ty existing
242+
|> List.iter (fun sm ->
243+
Xapi_sm.update_from_query_result ~__context sm query_result
244+
)
222245
)
223-
(Listext.List.intersect smapiv1_drivers (List.map fst existing)) ;
246+
(StringSet.inter smapiv1_drivers existing_types) ;
224247

225248
(* Synchronize SMAPIv2 plugins *)
226249

@@ -241,18 +264,62 @@ let on_xapi_start ~__context =
241264
f query_result
242265
)
243266
in
244-
List.iter
267+
StringSet.iter
245268
(fun ty ->
246269
with_query_result ty (Xapi_sm.create_from_query_result ~__context)
247270
)
248-
(Listext.List.set_difference running_smapiv2_drivers (List.map fst existing)) ;
271+
(StringSet.diff running_smapiv2_drivers existing_types) ;
249272
(* Update all existing SMAPIv2 plugins *)
250-
List.iter
273+
StringSet.iter
251274
(fun ty ->
252-
with_query_result ty
253-
(Xapi_sm.update_from_query_result ~__context (List.assoc ty existing))
275+
let@ qr = with_query_result ty in
276+
list_assoc_all ty existing
277+
|> List.iter (fun sm -> Xapi_sm.update_from_query_result ~__context sm qr)
254278
)
255-
(Listext.List.intersect running_smapiv2_drivers (List.map fst existing))
279+
(StringSet.inter running_smapiv2_drivers existing_types) ;
280+
281+
(* Warn in logs when there are still duplicates *)
282+
let add_to_dups (last, dups) (_, curr) =
283+
match (last.API.sM_type = curr.API.sM_type, dups) with
284+
| false, _ ->
285+
(curr, dups)
286+
| true, x :: _ when x = last ->
287+
(curr, curr :: dups)
288+
| true, _ ->
289+
(curr, curr :: last :: dups)
290+
in
291+
let find_all_duplicates lst =
292+
lst
293+
|> List.sort (fun (_, a_rc) (_, b_rc) ->
294+
Stdlib.compare a_rc.API.sM_type b_rc.API.sM_type
295+
)
296+
|> function
297+
| [] ->
298+
[]
299+
| head :: rest ->
300+
List.fold_left add_to_dups (snd head, []) rest |> snd
301+
in
302+
303+
let features_to_string feats =
304+
Fmt.(to_to_string (Dump.list (Dump.pair string int64)) feats)
305+
in
306+
let plugin_to_string plugin =
307+
Printf.sprintf "{ type:%s; name:%s; UUID:%s; features:%s; }"
308+
plugin.API.sM_type plugin.API.sM_name_label plugin.API.sM_uuid
309+
(features_to_string plugin.API.sM_features)
310+
in
311+
let log_plugins = function
312+
| [] ->
313+
()
314+
| duplicates ->
315+
let duplicates =
316+
String.concat "\n; " (List.map plugin_to_string duplicates)
317+
in
318+
warn "%s: found duplicate SM plugins for the same type: [\n %s\n]"
319+
__FUN duplicates
320+
in
321+
322+
Db.SM.get_all_records ~__context |> find_all_duplicates |> log_plugins
256323

257324
let bind ~__context ~pbd =
258325
let dbg = Context.string_of_task __context in

0 commit comments

Comments
 (0)