From 5a8c2ac42e67d66a714753c9e70d4258d9cab737 Mon Sep 17 00:00:00 2001 From: Jin Zhou Date: Mon, 3 Nov 2025 12:30:04 -0500 Subject: [PATCH 1/3] vine: param return-recovery-tasks --- taskvine/src/manager/vine_manager.c | 11 ++++++++++- taskvine/src/manager/vine_manager.h | 1 + 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/taskvine/src/manager/vine_manager.c b/taskvine/src/manager/vine_manager.c index ca72d257d9..5a90961b73 100644 --- a/taskvine/src/manager/vine_manager.c +++ b/taskvine/src/manager/vine_manager.c @@ -4153,6 +4153,8 @@ struct vine_manager *vine_ssl_create(int port, const char *key, const char *cert q->sandbox_grow_factor = 2.0; q->disk_proportion_available_to_task = 0.75; + q->return_recovery_tasks = 0; + q->stats = calloc(1, sizeof(struct vine_stats)); q->stats_measure = calloc(1, sizeof(struct vine_stats)); @@ -5232,7 +5234,11 @@ struct vine_task *find_task_to_return(struct vine_manager *q, const char *tag, i return t; break; case VINE_TASK_TYPE_RECOVERY: - /* do nothing and let vine_manager_consider_recovery_task do its job */ + /* if configured to return recovery tasks, return it to the user and let them take care of it. */ + if (q->return_recovery_tasks) { + return t; + } + /* otherwise, do nothing and let vine_manager_consider_recovery_task do its job */ break; case VINE_TASK_TYPE_LIBRARY_INSTANCE: /* silently delete the task, since it was created by the manager. 
@@ -5980,6 +5986,9 @@ int vine_tune(struct vine_manager *q, const char *name, double value) } else if (!strcmp(name, "enforce-worker-eviction-interval")) { q->enforce_worker_eviction_interval = (timestamp_t)(MAX(0, (int)value) * ONE_SECOND); + } else if (!strcmp(name, "return-recovery-tasks")) { + q->return_recovery_tasks = !!((int)value); + } else { debug(D_NOTICE | D_VINE, "Warning: tuning parameter \"%s\" not recognized\n", name); return -1; diff --git a/taskvine/src/manager/vine_manager.h b/taskvine/src/manager/vine_manager.h index bcf2405616..c22846b49c 100644 --- a/taskvine/src/manager/vine_manager.h +++ b/taskvine/src/manager/vine_manager.h @@ -231,6 +231,7 @@ struct vine_manager { double sandbox_grow_factor; /* When task disk sandboxes are exhausted, increase the allocation using their measured valued times this factor */ double disk_proportion_available_to_task; /* intentionally reduces disk allocation for tasks to reserve some space for cache growth. */ + int return_recovery_tasks; /* If true, recovery tasks are returned by vine_wait to the user. By default 0 and they are handled internally. */ /* todo: confirm datatype. int or int64 */ int max_task_stdout_storage; /* Maximum size of standard output from task. (If larger, send to a separate file.) */ From 8137dda3e42d3022724bb7313c6618ae521cf551 Mon Sep 17 00:00:00 2001 From: Jin Zhou Date: Thu, 6 Nov 2025 13:49:15 -0500 Subject: [PATCH 2/3] update manual --- doc/manuals/taskvine/index.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/manuals/taskvine/index.md b/doc/manuals/taskvine/index.md index a3d2617784..3ddf157a3e 100644 --- a/doc/manuals/taskvine/index.md +++ b/doc/manuals/taskvine/index.md @@ -2777,6 +2777,8 @@ change. | worker-retrievals | If 1, retrieve all completed tasks from a worker when retrieving results, even if going above the parameter max-retrievals . Otherwise, if 0, retrieve just one task before deciding to dispatch new tasks or connect new workers. 
| 1 | | watch-library-logfiles | If 1, watch the output files produced by each of the library processes running on the remote workers, take them back the current logging directory. | 0 | +| return-recovery-tasks | If set to 1, recovery tasks are returned to the application via `wait` instead of being handled internally. This is not expected in regular applications, the caller is aware of such behavior and should take care of it. | 0 | + === "Python" ```python From 60ff0620b3669c6a9e70118afd0d5cf1c90da5bb Mon Sep 17 00:00:00 2001 From: Jin Zhou Date: Thu, 6 Nov 2025 14:07:40 -0500 Subject: [PATCH 3/3] change line --- doc/manuals/taskvine/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/manuals/taskvine/index.md b/doc/manuals/taskvine/index.md index 3ddf157a3e..e84c1b5fce 100644 --- a/doc/manuals/taskvine/index.md +++ b/doc/manuals/taskvine/index.md @@ -2766,6 +2766,7 @@ change. | proportional-whole-tasks | Round up resource proportions such that only an integer number of tasks could be fit in the worker. The default is to use proportions. (See [task resources.](#task-resources) | 1 | | ramp-down-heuristic | If set to 1 and there are more workers than tasks waiting, then tasks are allocated all the free resources of a worker large enough to run them. If monitoring watchdog is not enabled, then this heuristic has no effect. | 0 | | resource-submit-multiplier | Assume that workers have `resource x resources-submit-multiplier` available.
This overcommits resources at the worker, causing tasks to be sent to workers that cannot be immediately executed.
The extra tasks wait at the worker until resources become available. | 1 | +| return-recovery-tasks | If set to 1, recovery tasks are returned to the application via `wait` instead of being handled internally. Regular applications should not need this; a caller that enables it is responsible for handling the returned recovery tasks itself. | 0 | | sandbox-grow-factor | When task disk sandboxes are exhausted, increase the allocation using their measured valued times this factor. Minimum is 1.1. | 2 | | short-timeout | Set the minimum timeout in seconds when sending a brief message to a single worker. | 5 | | temp-replica-count | Number of temp file replicas created across workers | 0 | @@ -2777,7 +2778,6 @@ change. | worker-retrievals | If 1, retrieve all completed tasks from a worker when retrieving results, even if going above the parameter max-retrievals . Otherwise, if 0, retrieve just one task before deciding to dispatch new tasks or connect new workers. | 1 | | watch-library-logfiles | If 1, watch the output files produced by each of the library processes running on the remote workers, take them back the current logging directory. | 0 | -| return-recovery-tasks | If set to 1, recovery tasks are returned to the application via `wait` instead of being handled internally. This is not expected in regular applications, the caller is aware of such behavior and should take care of it. | 0 | === "Python"