Skip to content

Commit adf2c5b

Browse files
rst0git authored and avagin committed
images/inventory: add field for enabled plugins
This patch extends the inventory image with a `plugins` field that contains an array of plugins which were used during checkpoint, for example, to save GPU state. In particular, the CUDA and AMDGPU plugins are added to this field only when the checkpoint contains GPU state. This allows us to disable unnecessary plugins during restore, show appropriate error messages if required CRIU plugins are missing, and migrate a process that does not use a GPU from a GPU-enabled system to a CPU-only environment. We use the `optional plugins_entry` for backwards compatibility. This entry allows us to distinguish between an *empty* and a *missing* field: - When the field is missing, it indicates that the checkpoint was created with a previous version of CRIU, and all plugins should be *enabled* during restore. - When the field is empty, it indicates that no plugins were used during checkpointing. Thus, all plugins can be *disabled* during restore. Signed-off-by: Radostin Stoyanov <[email protected]>
1 parent 87b5ac9 commit adf2c5b

File tree

7 files changed

+193
-5
lines changed

7 files changed

+193
-5
lines changed

criu/cr-restore.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2354,12 +2354,12 @@ int cr_restore_tasks(void)
23542354
if (init_service_fd())
23552355
return 1;
23562356

2357-
if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE))
2358-
return -1;
2359-
23602357
if (check_img_inventory(/* restore = */ true) < 0)
23612358
goto err;
23622359

2360+
if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE))
2361+
return -1;
2362+
23632363
if (init_stats(RESTORE_STATS))
23642364
goto err;
23652365

criu/image.c

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,14 @@ TaskKobjIdsEntry *root_ids;
2626
u32 root_cg_set;
2727
Lsmtype image_lsm;
2828

29+
struct inventory_plugin {
30+
struct list_head node;
31+
char *name;
32+
};
33+
34+
struct list_head inventory_plugins_list = LIST_HEAD_INIT(inventory_plugins_list);
35+
static int n_inventory_plugins;
36+
2937
int check_img_inventory(bool restore)
3038
{
3139
int ret = -1;
@@ -99,6 +107,19 @@ int check_img_inventory(bool restore)
99107
} else {
100108
opts.network_lock_method = he->network_lock_method;
101109
}
110+
111+
if (!he->plugins_entry) {
112+
/* backwards compatibility: if the 'plugins_entry' field is missing,
113+
* all plugins should be enabled during restore.
114+
*/
115+
n_inventory_plugins = -1;
116+
} else {
117+
PluginsEntry *pe = he->plugins_entry;
118+
for (int i = 0; i < pe->n_plugins; i++) {
119+
if (add_inventory_plugin(pe->plugins[i]))
120+
goto out_err;
121+
}
122+
}
102123
}
103124

104125
ret = 0;
@@ -110,8 +131,92 @@ int check_img_inventory(bool restore)
110131
return ret;
111132
}
112133

134+
/**
135+
* Check if the 'plugins' field in the inventory image contains
136+
* the specified plugin name. If found, the plugin is removed
137+
* from the linked list.
138+
*/
139+
bool check_and_remove_inventory_plugin(const char *name, size_t n)
140+
{
141+
if (n_inventory_plugins == -1)
142+
return true; /* backwards compatibility */
143+
144+
if (n_inventory_plugins > 0) {
145+
struct inventory_plugin *p, *tmp;
146+
147+
list_for_each_entry_safe(p, tmp, &inventory_plugins_list, node) {
148+
if (!strncmp(name, p->name, n)) {
149+
xfree(p->name);
150+
list_del(&p->node);
151+
xfree(p);
152+
n_inventory_plugins--;
153+
return true;
154+
}
155+
}
156+
}
157+
158+
return false;
159+
}
160+
161+
/**
162+
* We expect during restore all loaded plugins to be removed from
163+
* the inventory_plugins_list. If the list is not empty, show an
164+
* error message for each missing plugin.
165+
*/
166+
int check_inventory_plugins(void)
167+
{
168+
struct inventory_plugin *p;
169+
170+
if (n_inventory_plugins <= 0)
171+
return 0;
172+
173+
list_for_each_entry(p, &inventory_plugins_list, node) {
174+
pr_err("Missing required plugin: %s\n", p->name);
175+
}
176+
177+
return -1;
178+
}
179+
180+
/**
181+
* Add plugin name to the inventory image. These values
182+
* can be used to identify required plugins during restore.
183+
*/
184+
int add_inventory_plugin(const char *name)
185+
{
186+
struct inventory_plugin *p;
187+
188+
p = xmalloc(sizeof(struct inventory_plugin));
189+
if (p == NULL)
190+
return -1;
191+
192+
p->name = xstrdup(name);
193+
if (!p->name) {
194+
xfree(p);
195+
return -1;
196+
}
197+
list_add(&p->node, &inventory_plugins_list);
198+
n_inventory_plugins++;
199+
200+
return 0;
201+
}
202+
203+
void free_inventory_plugins_list(void)
204+
{
205+
struct inventory_plugin *p, *tmp;
206+
207+
if (!list_empty(&inventory_plugins_list)) {
208+
list_for_each_entry_safe(p, tmp, &inventory_plugins_list, node) {
209+
xfree(p->name);
210+
list_del(&p->node);
211+
xfree(p);
212+
}
213+
}
214+
n_inventory_plugins = 0;
215+
}
216+
113217
int write_img_inventory(InventoryEntry *he)
114218
{
219+
PluginsEntry pe = PLUGINS_ENTRY__INIT;
115220
struct cr_img *img;
116221
int ret;
117222

@@ -121,8 +226,27 @@ int write_img_inventory(InventoryEntry *he)
121226
if (!img)
122227
return -1;
123228

229+
if (!list_empty(&inventory_plugins_list)) {
230+
struct inventory_plugin *p;
231+
int i = 0;
232+
233+
pe.n_plugins = n_inventory_plugins;
234+
pe.plugins = xmalloc(n_inventory_plugins * sizeof(char *));
235+
if (!pe.plugins)
236+
return -1;
237+
238+
list_for_each_entry(p, &inventory_plugins_list, node) {
239+
pe.plugins[i] = p->name;
240+
i++;
241+
}
242+
}
243+
he->plugins_entry = &pe;
244+
124245
ret = pb_write_one(img, he, PB_INVENTORY);
125246

247+
free_inventory_plugins_list();
248+
xfree(pe.plugins);
249+
126250
xfree(he->root_ids);
127251
close_image(img);
128252
if (ret < 0)

criu/include/image.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,4 +177,8 @@ extern int read_img_str(struct cr_img *, char **pstr, int size);
177177

178178
extern void close_image(struct cr_img *);
179179

180+
extern int add_inventory_plugin(const char *name);
181+
extern int check_inventory_plugins(void);
182+
extern bool check_and_remove_inventory_plugin(const char *name, size_t n);
183+
180184
#endif /* __CR_IMAGE_H__ */

criu/plugin.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,9 @@ int cr_plugin_init(int stage)
256256
goto err;
257257
}
258258

259+
if (stage == CR_PLUGIN_STAGE__RESTORE && check_inventory_plugins())
260+
goto err;
261+
259262
exit_code = 0;
260263
err:
261264
closedir(d);

images/inventory.proto

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,13 @@ enum lsmtype {
1010
APPARMOR = 2;
1111
}
1212

13+
// It is not possible to distinguish between an empty repeated field
14+
// and unset repeated field. To solve this problem and provide backwards
15+
// compatibility, we use the 'plugins_entry' message.
16+
message plugins_entry {
17+
repeated string plugins = 12;
18+
};
19+
1320
message inventory_entry {
1421
required uint32 img_version = 1;
1522
optional bool fdinfo_per_id = 2;
@@ -21,4 +28,5 @@ message inventory_entry {
2128
optional uint32 pre_dump_mode = 9;
2229
optional bool tcp_close = 10;
2330
optional uint32 network_lock_method = 11;
31+
optional plugins_entry plugins_entry = 12;
2432
}

plugins/amdgpu/amdgpu_plugin.c

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,10 @@ static LIST_HEAD(update_vma_info_list);
6060

6161
size_t kfd_max_buffer_size;
6262

63+
bool plugin_added_to_inventory = false;
64+
65+
bool plugin_disabled = false;
66+
6367
/**************************************************************************************************/
6468

6569
/* Call ioctl, restarting if it is interrupted */
@@ -332,6 +336,13 @@ void getenv_size_t(const char *var, size_t *value)
332336

333337
int amdgpu_plugin_init(int stage)
334338
{
339+
if (stage == CR_PLUGIN_STAGE__RESTORE) {
340+
if (!check_and_remove_inventory_plugin(CR_PLUGIN_DESC.name, strlen(CR_PLUGIN_DESC.name))) {
341+
plugin_disabled = true;
342+
return 0;
343+
}
344+
}
345+
335346
pr_info("initialized: %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name);
336347

337348
topology_init(&src_topology);
@@ -365,6 +376,9 @@ int amdgpu_plugin_init(int stage)
365376

366377
void amdgpu_plugin_fini(int stage, int ret)
367378
{
379+
if (plugin_disabled)
380+
return;
381+
368382
pr_info("finished %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name);
369383

370384
if (stage == CR_PLUGIN_STAGE__RESTORE)
@@ -414,6 +428,14 @@ int amdgpu_plugin_handle_device_vma(int fd, const struct stat *st_buf)
414428
if (ret)
415429
pr_perror("%s(), Can't handle VMAs of input device", __func__);
416430

431+
if (!ret && !plugin_added_to_inventory) {
432+
ret = add_inventory_plugin(CR_PLUGIN_DESC.name);
433+
if (ret)
434+
pr_err("Failed to add AMDGPU plugin to inventory image\n");
435+
else
436+
plugin_added_to_inventory = true;
437+
}
438+
417439
return ret;
418440
}
419441
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__HANDLE_DEVICE_VMA, amdgpu_plugin_handle_device_vma)
@@ -1540,6 +1562,9 @@ int amdgpu_plugin_restore_file(int id)
15401562
size_t img_size;
15411563
FILE *img_fp = NULL;
15421564

1565+
if (plugin_disabled)
1566+
return -ENOTSUP;
1567+
15431568
pr_info("Initialized kfd plugin restorer with ID = %d\n", id);
15441569

15451570
snprintf(img_path, sizeof(img_path), IMG_KFD_FILE, id);
@@ -1746,6 +1771,9 @@ int amdgpu_plugin_update_vmamap(const char *in_path, const uint64_t addr, const
17461771
char *p_end;
17471772
bool is_kfd = false, is_renderD = false;
17481773

1774+
if (plugin_disabled)
1775+
return -ENOTSUP;
1776+
17491777
plugin_log_msg("Enter %s\n", __func__);
17501778

17511779
strncpy(path, in_path, sizeof(path));
@@ -1805,6 +1833,9 @@ int amdgpu_plugin_resume_devices_late(int target_pid)
18051833
struct kfd_ioctl_criu_args args = { 0 };
18061834
int fd, exit_code = 0;
18071835

1836+
if (plugin_disabled)
1837+
return -ENOTSUP;
1838+
18081839
pr_info("Inside %s for target pid = %d\n", __func__, target_pid);
18091840

18101841
fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC);

plugins/cuda/cuda_plugin.c

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@
3838
*/
3939
bool plugin_disabled = false;
4040

41+
bool plugin_added_to_inventory = false;
42+
4143
struct pid_info {
4244
int pid;
4345
char checkpointed;
@@ -319,7 +321,7 @@ int cuda_plugin_checkpoint_devices(int pid)
319321
k_rtsigset_t save_sigset;
320322

321323
if (plugin_disabled) {
322-
return 0;
324+
return -ENOTSUP;
323325
}
324326

325327
restore_tid = get_cuda_restore_tid(pid);
@@ -354,6 +356,15 @@ int cuda_plugin_checkpoint_devices(int pid)
354356
pr_err("Failed to restore process after error %s on pid %d\n", msg_buf, pid);
355357
}
356358
}
359+
360+
if (!status && !plugin_added_to_inventory) {
361+
status = add_inventory_plugin(CR_PLUGIN_DESC.name);
362+
if (status)
363+
pr_err("Failed to add CUDA plugin to inventory image\n");
364+
else
365+
plugin_added_to_inventory = true;
366+
}
367+
357368
interrupt:
358369
int_ret = interrupt_restore_thread(restore_tid, &save_sigset);
359370

@@ -367,7 +378,7 @@ int cuda_plugin_pause_devices(int pid)
367378
char msg_buf[CUDA_CKPT_BUF_SIZE];
368379

369380
if (plugin_disabled) {
370-
return 0;
381+
return -ENOTSUP;
371382
}
372383

373384
restore_tid = get_cuda_restore_tid(pid);
@@ -463,6 +474,13 @@ int cuda_plugin_init(int stage)
463474
{
464475
int ret;
465476

477+
if (stage == CR_PLUGIN_STAGE__RESTORE) {
478+
if (!check_and_remove_inventory_plugin(CR_PLUGIN_DESC.name, strlen(CR_PLUGIN_DESC.name))) {
479+
plugin_disabled = true;
480+
return 0;
481+
}
482+
}
483+
466484
if (!fault_injected(FI_PLUGIN_CUDA_FORCE_ENABLE) && access("/dev/nvidiactl", F_OK)) {
467485
pr_info("/dev/nvidiactl doesn't exist. The CUDA plugin is disabled.\n");
468486
plugin_disabled = true;

0 commit comments

Comments
 (0)