Skip to content

Commit 87104b7

Browse files
authored
Fix removal sequence to avoid some rare memory errors: (#4224)
- vine_cache_check_file should *not* remove the object, because some callers expect it to be valid. - vine_cache_check_files *should* remove because it has no further use for the object.
1 parent 099cd2a commit 87104b7

File tree

3 files changed

+30
-18
lines changed

3 files changed

+30
-18
lines changed

taskvine/src/worker/vine_cache.c

Lines changed: 24 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ struct vine_cache {
4444
int max_transfer_procs;
4545
};
4646

47-
static void vine_cache_wait_for_file(struct vine_cache *c, struct vine_cache_file *f, const char *cachename, struct link *manager);
47+
static void vine_cache_check_file(struct vine_cache *c, struct vine_cache_file *f, const char *cachename, struct link *manager);
4848

4949
/*
5050
Create the cache manager structure for a given cache directory.
@@ -151,6 +151,7 @@ void vine_cache_prune(struct vine_cache *c, vine_cache_level_t level)
151151

152152
/*
153153
Kill off any process associated with this file object.
154+
The cache_file object is still valid afterwards.
154155
Used by both vine_cache_remove and vine_cache_delete.
155156
*/
156157

@@ -159,9 +160,9 @@ static void vine_cache_kill(struct vine_cache *c, struct vine_cache_file *f, con
159160
while (f->status == VINE_CACHE_STATUS_PROCESSING) {
160161
debug(D_VINE, "cache: killing pending transfer process %d...", f->pid);
161162
kill(f->pid, SIGKILL);
162-
vine_cache_wait_for_file(c, f, cachename, manager);
163+
vine_cache_check_file(c, f, cachename, manager);
163164
if (f->status == VINE_CACHE_STATUS_PROCESSING) {
164-
debug(D_VINE, "cache:still not killed, trying again!");
165+
debug(D_VINE, "cache: still not killed, trying again!");
165166
sleep(1);
166167
}
167168
}
@@ -171,7 +172,7 @@ static void vine_cache_kill(struct vine_cache *c, struct vine_cache_file *f, con
171172
Process pending transfers until we reach the maximum number of processing transfers or there are no more pending transfers.
172173
*/
173174

174-
int vine_cache_process_pending_transfers(struct vine_cache *c)
175+
int vine_cache_start_transfers(struct vine_cache *c)
175176
{
176177
int processed = 0;
177178

@@ -413,6 +414,7 @@ int vine_cache_remove(struct vine_cache *c, const char *cachename, struct link *
413414
/* Manager's replica state machine expects a response to every unlink message.
414415
* Other states except PENDING have already sent messages, either cache-update or cache-invalid,
415416
* so we only send cache-invalid for transfers in PENDING state. */
417+
416418
if (f->status == VINE_CACHE_STATUS_PENDING) {
417419
char *msg = string_format("File '%s' removed in PENDING state.", cachename);
418420
vine_worker_send_cache_invalid(manager, cachename, msg);
@@ -777,6 +779,7 @@ vine_cache_status_t vine_cache_ensure(struct vine_cache *c, const char *cachenam
777779

778780
/*
779781
Check the outputs of a transfer process to make sure they are valid.
782+
Will send either a cache-update or a cache-invalid depending on the outcome.
780783
*/
781784

782785
static void vine_cache_check_outputs(struct vine_cache *c, struct vine_cache_file *f, const char *cachename, struct link *manager)
@@ -903,9 +906,11 @@ static void vine_cache_handle_exit_status(struct vine_cache *c, struct vine_cach
903906

904907
/*
905908
Consider one cache table entry to determine if the transfer process has completed.
909+
If the transfer completed or failed, a cache-update or cache-invalid will be sent.
910+
Regardless of the outcome, the cache file object will still be valid.
906911
*/
907912

908-
static void vine_cache_wait_for_file(struct vine_cache *c, struct vine_cache_file *f, const char *cachename, struct link *manager)
913+
static void vine_cache_check_file(struct vine_cache *c, struct vine_cache_file *f, const char *cachename, struct link *manager)
909914
{
910915
int status;
911916
if (f->status == VINE_CACHE_STATUS_PROCESSING) {
@@ -919,27 +924,33 @@ static void vine_cache_wait_for_file(struct vine_cache *c, struct vine_cache_fil
919924
vine_cache_handle_exit_status(c, f, cachename, status);
920925
vine_cache_check_outputs(c, f, cachename, manager);
921926

922-
if (f->status == VINE_CACHE_STATUS_FAILED) {
923-
/* if transfer failed, then we delete all of our records of the file. The manager
924-
* assumes that the file is not at the worker after the manager receives
925-
* the cache invalid message sent from vine_cache_check_outputs. */
926-
vine_cache_remove(c, cachename, manager);
927-
}
927+
/* If the transfer failed, the vine_cache_file still remains here in the FAILED state. */
928+
/* Various functions expect the vine_cache_file object to exist still. */
928929
}
929930
}
930931
}
931932

932933
/*
933934
Search the cache table to determine if any transfer processes have completed.
935+
If any have definitively failed, they are removed from the cache.
934936
*/
935937

936-
int vine_cache_wait(struct vine_cache *c, struct link *manager)
938+
int vine_cache_check_files(struct vine_cache *c, struct link *manager)
937939
{
938940
struct vine_cache_file *f;
939941
char *cachename;
940942
HASH_TABLE_ITERATE(c->table, cachename, f)
941943
{
942-
vine_cache_wait_for_file(c, f, cachename, manager);
944+
vine_cache_check_file(c, f, cachename, manager);
945+
946+
if (f->status == VINE_CACHE_STATUS_FAILED) {
947+
/* if transfer failed, then we delete all of our records of the file. The manager
948+
* assumes that the file is not at the worker after the manager receives
949+
* the cache invalid message sent from vine_cache_check_outputs. */
950+
vine_cache_remove(c, cachename, manager);
951+
}
952+
953+
/* Note that f may not longer be valid at this point */
943954
}
944955
return 1;
945956
}

taskvine/src/worker/vine_cache.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,8 @@ int vine_cache_add_mini_task( struct vine_cache *c, const char *cachename, const
6262
vine_cache_status_t vine_cache_ensure( struct vine_cache *c, const char *cachename);
6363
int vine_cache_remove( struct vine_cache *c, const char *cachename, struct link *manager );
6464
int vine_cache_contains( struct vine_cache *c, const char *cachename );
65-
int vine_cache_wait( struct vine_cache *c, struct link *manager );
66-
int vine_cache_process_pending_transfers(struct vine_cache *c);
65+
66+
int vine_cache_check_files( struct vine_cache *c, struct link *manager );
67+
int vine_cache_start_transfers(struct vine_cache *c);
6768

6869
#endif

taskvine/src/worker/vine_worker.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1713,7 +1713,7 @@ static void vine_worker_serve_manager(struct link *manager)
17131713
expire_procs_running();
17141714

17151715
ok &= handle_completed_tasks(manager);
1716-
ok &= vine_cache_wait(cache_manager, manager);
1716+
ok &= vine_cache_check_files(cache_manager, manager);
17171717

17181718
measure_worker_resources();
17191719

@@ -1737,8 +1737,8 @@ static void vine_worker_serve_manager(struct link *manager)
17371737
break;
17381738
}
17391739

1740-
/* Periodically process pending transfers. */
1741-
vine_cache_process_pending_transfers(cache_manager);
1740+
/* Periodically start some pending transfers. */
1741+
vine_cache_start_transfers(cache_manager);
17421742

17431743
/* Check all known libraries if they are ready to execute functions. */
17441744
check_libraries_ready(manager);

0 commit comments

Comments
 (0)