Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 45 additions & 26 deletions taskvine/src/manager/vine_manager.c
Original file line number Diff line number Diff line change
Expand Up @@ -935,6 +935,49 @@ static void cleanup_worker_files(struct vine_manager *q, struct vine_worker_info
hash_table_free_keys_array(cachenames);
}

/** Evict a random worker to simulate a failure. */
int vine_manager_evict_a_random_worker(struct vine_manager *q)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's all it release_random_worker. Eviction means to kick off a worker from a compute node.

{
if (!q) {
return 0;
}

if (hash_table_size(q->worker_table) == 0) {
return 0;
}

int removed = 0;

/* collect removable workers */
struct list *candidates_list = list_create();
char *key;
struct vine_worker_info *w;
HASH_TABLE_ITERATE(q->worker_table, key, w)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is hash table random start. You can use it instead of creating a list of all the workers, which could be expensive.

{
list_push_tail(candidates_list, w);
}

/* release a random worker if any */
int random_number = random_int64();
if (random_number < 0) {
random_number = -random_number;
}
int index = (int)(random_number % list_size(candidates_list));
int i = 0;
while ((w = list_pop_head(candidates_list))) {
if (i++ == index) {
/* evict this worker */
debug(D_VINE | D_NOTICE, "Intentionally evicting worker %s", w->hostname);
release_worker(q, w);
removed = 1;
break;
}
}

list_delete(candidates_list);
return removed;
}

/*
This function enforces a target worker eviction rate (1 every X seconds).
If the observed eviction interval is shorter than the desired one, we randomly evict one worker
Expand Down Expand Up @@ -973,32 +1016,8 @@ static int enforce_worker_eviction_interval(struct vine_manager *q)
return 0;
}

/* collect removable workers */
struct list *candidates_list = list_create();
char *key;
struct vine_worker_info *w;
HASH_TABLE_ITERATE(q->worker_table, key, w)
{
if (w->type != VINE_WORKER_TYPE_WORKER) {
continue;
}
list_push_tail(candidates_list, w);
}

/* release a random worker if any */
int index = (int)(random_int64() % list_size(candidates_list));
int i = 0;
while ((w = list_pop_head(candidates_list))) {
if (i++ == index) {
/* evict this worker */
debug(D_VINE | D_NOTICE, "Intentionally evicting worker %s", w->hostname);
release_worker(q, w);
break;
}
}
list_delete(candidates_list);

return 1;
/* evict a random worker if any */
return vine_manager_evict_a_random_worker(q);
}

/* Remove all tasks and other associated state from a given worker. */
Expand Down
3 changes: 3 additions & 0 deletions taskvine/src/manager/vine_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,9 @@ void vine_manager_remove_worker(struct vine_manager *q, struct vine_worker_info
/* Check if the worker is able to transfer the necessary files for this task. */
int vine_manager_transfer_capacity_available(struct vine_manager *q, struct vine_worker_info *w, struct vine_task *t);

/** Evict a random worker to simulate a failure.
Picks one worker at random from the manager's worker table and releases it.
@param q The manager to act on. A NULL manager is tolerated.
@return 1 if a worker was released, 0 if q is NULL or the worker table is empty.
*/
int vine_manager_evict_a_random_worker(struct vine_manager *q);

/* The expected format of files created by the resource monitor.*/
#define RESOURCE_MONITOR_TASK_LOCAL_NAME "vine-task-%d"
#define RESOURCE_MONITOR_REMOTE_NAME "cctools-monitor"
Expand Down