diff --git a/dttools/src/.gitignore b/dttools/src/.gitignore index 019e34fc65..1af0628763 100644 --- a/dttools/src/.gitignore +++ b/dttools/src/.gitignore @@ -41,4 +41,5 @@ bucketing_manager_test hash_table_fromkey_test hash_table_offset_test hash_table_benchmark -priority_queue_test \ No newline at end of file +priority_queue_test +progress_bar_test \ No newline at end of file diff --git a/dttools/src/Makefile b/dttools/src/Makefile index 829e42ef5c..c5103861b0 100644 --- a/dttools/src/Makefile +++ b/dttools/src/Makefile @@ -94,6 +94,8 @@ SOURCES = \ priority_queue.c \ priority_queue_test.c \ process.c \ + progress_bar.c \ + progress_bar_test.c \ random.c \ rmonitor.c \ rmonitor_poll.c \ @@ -164,6 +166,7 @@ HEADERS_PUBLIC = \ macros.h \ path.h \ priority_queue.h \ + progress_bar.h \ rmonitor_poll.h \ rmsummary.h \ stringtools.h \ @@ -193,7 +196,7 @@ PROGRAMS = $(MOST_PROGRAMS) catalog_query SCRIPTS = cctools_gpu_autodetect TARGETS = $(LIBRARIES) $(PRELOAD_LIBRARIES) $(PROGRAMS) $(TEST_PROGRAMS) -TEST_PROGRAMS = auth_test disk_alloc_test jx_test microbench multirun jx_count_obj_test jx_canonicalize_test jx_merge_test hash_table_offset_test hash_table_fromkey_test hash_table_benchmark histogram_test category_test jx_binary_test bucketing_base_test bucketing_manager_test priority_queue_test +TEST_PROGRAMS = auth_test disk_alloc_test jx_test microbench multirun jx_count_obj_test jx_canonicalize_test jx_merge_test hash_table_offset_test hash_table_fromkey_test hash_table_benchmark histogram_test category_test jx_binary_test bucketing_base_test bucketing_manager_test priority_queue_test progress_bar_test all: $(TARGETS) catalog_query diff --git a/dttools/src/progress_bar.c b/dttools/src/progress_bar.c new file mode 100644 index 0000000000..012d80e003 --- /dev/null +++ b/dttools/src/progress_bar.c @@ -0,0 +1,295 @@ +/* +Copyright (C) 2025 The University of Notre Dame +This software is distributed under the GNU General Public License. +See the file COPYING for details. +*/ + +/** @file progress_bar.c +Implementation of a terminal progress bar with multiple parts. +*/ + +#include "progress_bar.h" +#include "xxmalloc.h" +#include "macros.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Max bar width (in block characters) for single-line rendering. */ +#define MAX_BAR_WIDTH 30 +/* Typed time constants (microseconds). */ +static const timestamp_t SECOND_US = 1000000ULL; +static const timestamp_t MILLISECOND_US = 1000ULL; +static const timestamp_t MICROSECOND_US = 1ULL; + +/* Minimum redraw interval to avoid flicker (200ms). */ +#define PROGRESS_BAR_UPDATE_INTERVAL_US (SECOND_US / 5) + +#define COLOR_RESET "\033[0m" +#define COLOR_GREEN "\033[32m" +#define COLOR_CYAN "\033[38;2;0;255;255m" +#define COLOR_ORANGE "\033[38;2;255;165;0m" +#define COLOR_PURPLE "\033[38;2;128;0;128m" +#define COLOR_PINK "\033[38;2;255;192;203m" +#define COLOR_YELLOW "\033[38;2;255;255;0m" + +/** Get terminal width in columns; return 80 on failure. */ +static int get_terminal_width() +{ + struct winsize w; + + if (ioctl(STDOUT_FILENO, TIOCGWINSZ, &w) == -1) { + return 80; + } + + return w.ws_col; +} + +/** Compute bar width based on terminal and labels; clamp to bounds. 
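For example, an 80-column terminal, a 7-character label, and 30 characters of part text give 80 - 7 - 30 - 28 = 15, which is within the [10, 30] clamp, so the returned width is (int)(15 * 0.8) = 12 block characters.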
*/ +static int compute_bar_width(const char *label, int part_text_len) +{ + if (!label) { + return 0; + } + + int term_width = get_terminal_width(); + int label_len = strlen(label); + int bar_width = term_width - label_len - part_text_len - 28; + + if (bar_width > MAX_BAR_WIDTH) { + bar_width = MAX_BAR_WIDTH; + } + + if (bar_width < 10) { + bar_width = 10; + } + + return (int)(bar_width * 0.8); +} + +/** Render one-line progress bar with aggregated totals, progress, and elapsed time. */ +static void print_progress_bar(struct ProgressBar *bar) +{ + if (!bar) { + return; + } + + bar->last_draw_time_us = timestamp_get(); + + char part_text[256]; + char *ptr = part_text; + int remain = sizeof(part_text); + int written = snprintf(ptr, remain, "["); + ptr += written; + remain -= written; + + uint64_t total_sum = 0; + uint64_t current_sum = 0; + + bool first = true; + struct ProgressBarPart *p; + LIST_ITERATE(bar->parts, p) + { + total_sum += p->total; + current_sum += p->current; + + if (!first) { + written = snprintf(ptr, remain, ", "); + ptr += written; + remain -= written; + } + + written = snprintf(ptr, remain, "%s: %" PRIu64 "/%" PRIu64, p->label, p->current, p->total); + ptr += written; + remain -= written; + + first = false; + } + snprintf(ptr, remain, "]"); + part_text[sizeof(part_text) - 1] = '\0'; + + float progress = (total_sum > 0) ? ((float)current_sum / total_sum) : 0.0f; + if (progress > 1.0f) { + progress = 1.0f; + } + + timestamp_t elapsed = timestamp_get() - bar->start_time_us; + int h = elapsed / (3600LL * SECOND_US); + int m = (elapsed % (3600LL * SECOND_US)) / (60LL * SECOND_US); + int s = (elapsed % (60LL * SECOND_US)) / SECOND_US; + + if (bar->has_drawn_once) { + printf("\r\033[2K"); + } else { + bar->has_drawn_once = 1; + } + + int part_text_len = (int)(ptr - part_text) + 1; + int bar_width = compute_bar_width(bar->label, part_text_len); + int filled = (int)(progress * bar_width); + + char bar_line[MAX_BAR_WIDTH * 3 + 1]; + int offset = 0; + const char *block = "━"; + + for (int i = 0; i < filled; ++i) { + memcpy(bar_line + offset, block, 3); + offset += 3; + } + + memset(bar_line + offset, ' ', (bar_width - filled)); + offset += (bar_width - filled); + bar_line[offset] = '\0'; + + printf("%s " COLOR_GREEN "%s %" PRIu64 "/%" PRIu64 COLOR_YELLOW " %s" COLOR_CYAN " %.1f%%" COLOR_ORANGE " %02d:%02d:%02d" COLOR_RESET, + bar->label ? bar->label : "", + bar_line, + current_sum, + total_sum, + part_text, + progress * 100, + h, + m, + s); + + fflush(stdout); +} + +/** Create and initialize a progress bar. */ +struct ProgressBar *progress_bar_init(const char *label) +{ + if (!label) { + return NULL; + } + + struct ProgressBar *bar = xxmalloc(sizeof(struct ProgressBar)); + + bar->label = xxstrdup(label); + bar->parts = list_create(); + bar->start_time_us = timestamp_get(); + bar->last_draw_time_us = 0; + bar->update_interval_us = PROGRESS_BAR_UPDATE_INTERVAL_US; + bar->update_interval_sec = (double)bar->update_interval_us / SECOND_US; + bar->has_drawn_once = 0; + + return bar; +} + +/** Set the update interval for the progress bar. */ +void progress_bar_set_update_interval(struct ProgressBar *bar, double update_interval_sec) +{ + if (!bar) { + return; + } + + if (update_interval_sec < 0) { + update_interval_sec = 0; + } + bar->update_interval_sec = update_interval_sec; + /* Convert seconds to microseconds with saturation to avoid overflow. 
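For example, 0.5 s becomes 500000 us, while any value at or above UINT64_MAX / SECOND_US seconds saturates to UINT64_MAX.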
*/ + if (update_interval_sec >= (double)UINT64_MAX / (double)SECOND_US) { + bar->update_interval_us = (timestamp_t)UINT64_MAX; + } else { + bar->update_interval_us = (timestamp_t)(update_interval_sec * (double)SECOND_US); + } +} + +/** Create a new part. */ +struct ProgressBarPart *progress_bar_create_part(const char *label, uint64_t total) +{ + if (!label) { + return NULL; + } + + struct ProgressBarPart *part = xxmalloc(sizeof(struct ProgressBarPart)); + + part->label = xxstrdup(label); + part->total = total; + part->current = 0; + + return part; +} + +/** Bind a part to the progress bar. */ +void progress_bar_bind_part(struct ProgressBar *bar, struct ProgressBarPart *part) +{ + if (!bar || !part) { + return; + } + + list_push_tail(bar->parts, part); + print_progress_bar(bar); +} + +/** Set the total for a part. */ +void progress_bar_set_part_total(struct ProgressBar *bar, struct ProgressBarPart *part, uint64_t new_total) +{ + if (!bar || !part) { + return; + } + + part->total = new_total; +} + +/** Advance a part's current value, redraw if needed. */ +void progress_bar_update_part(struct ProgressBar *bar, struct ProgressBarPart *part, uint64_t increment) +{ + if (!bar || !part) { + return; + } + + part->current += increment; + if (part->current > part->total) { + part->current = part->total; + } + + timestamp_t now_us = timestamp_get(); + if (!bar->has_drawn_once || (now_us - bar->last_draw_time_us) >= bar->update_interval_us) { + print_progress_bar(bar); + } +} + +/** Set the start time for the progress bar. */ +void progress_bar_set_start_time(struct ProgressBar *bar, timestamp_t start_time) +{ + if (!bar) { + return; + } + + bar->start_time_us = start_time; +} + +/** Final render and newline. */ +void progress_bar_finish(struct ProgressBar *bar) +{ + if (!bar) { + return; + } + + print_progress_bar(bar); + printf("\n"); +} + +/** Free the progress bar, its parts, and internal resources. */ +void progress_bar_delete(struct ProgressBar *bar) +{ + if (!bar) { + return; + } + + free(bar->label); + struct ProgressBarPart *p; + LIST_ITERATE(bar->parts, p) + { + free(p->label); + free(p); + } + list_delete(bar->parts); + free(bar); +} diff --git a/dttools/src/progress_bar.h b/dttools/src/progress_bar.h new file mode 100644 index 0000000000..7fe3171a3f --- /dev/null +++ b/dttools/src/progress_bar.h @@ -0,0 +1,96 @@ +/* +Copyright (C) 2025 The University of Notre Dame +This software is distributed under the GNU General Public License. +See the file COPYING for details. +*/ + +/** @file progress_bar.h +Terminal progress bar API with multiple parts. +*/ + +#ifndef PROGRESS_BAR_H +#define PROGRESS_BAR_H + +#include "list.h" +#include "timestamp.h" +#include +#include + +/** A part of a progress bar. */ +struct ProgressBarPart { + char *label; + uint64_t total; + uint64_t current; +}; + +/** Progress bar object. */ +struct ProgressBar { + /* User-facing interval in seconds; internal comparisons use *_us. */ + double update_interval_sec; + char *label; + struct list *parts; + /* Timestamps in microseconds. */ + timestamp_t start_time_us; + timestamp_t last_draw_time_us; + timestamp_t update_interval_us; + int has_drawn_once; +}; + +/* Progress Bar Part API */ + +/** Create a progress bar. +@param label Progress bar label (internally duplicated). +@return New progress bar. +*/ +struct ProgressBar *progress_bar_init(const char *label); + +/** Set the update interval for the progress bar. +@param bar Progress bar. +@param update_interval_sec Update interval in seconds. 
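+For example, progress_bar_set_update_interval(bar, 0.5) limits redraws to at most one every half second; negative values are treated as zero.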
+*/ +void progress_bar_set_update_interval(struct ProgressBar *bar, double update_interval_sec); + +/** Create a new part. +@param label Part label (internally duplicated). +@param total Total units for the part. +@return New part. +*/ +struct ProgressBarPart *progress_bar_create_part(const char *label, uint64_t total); + +/** Bind a part to the progress bar. +@param bar Progress bar. +@param part Part to bind. +*/ +void progress_bar_bind_part(struct ProgressBar *bar, struct ProgressBarPart *part); + +/** Set the total for a part. +@param bar Progress bar. +@param part Part to update. +@param new_total New total units. +*/ +void progress_bar_set_part_total(struct ProgressBar *bar, struct ProgressBarPart *part, uint64_t new_total); + +/** Update the current value for a part, redraw if needed. +@param bar Progress bar. +@param part Part to advance. +@param increment Amount to add. +*/ +void progress_bar_update_part(struct ProgressBar *bar, struct ProgressBarPart *part, uint64_t increment); + +/** Set the start time for the progress bar. +@param bar Progress bar. +@param start_time Start timestamp. +*/ +void progress_bar_set_start_time(struct ProgressBar *bar, timestamp_t start_time); + +/** Finish the progress bar: draw once and print a newline. +@param bar Progress bar. +*/ +void progress_bar_finish(struct ProgressBar *bar); + +/** Delete the progress bar and free all parts. +@param bar Progress bar. +*/ +void progress_bar_delete(struct ProgressBar *bar); + +#endif diff --git a/dttools/src/progress_bar_test.c b/dttools/src/progress_bar_test.c new file mode 100644 index 0000000000..4a7b70ed1c --- /dev/null +++ b/dttools/src/progress_bar_test.c @@ -0,0 +1,35 @@ +#include "progress_bar.h" +#include "list.h" +#include "timestamp.h" +#include +#include + +int main() +{ + uint64_t total = 1000000; + struct ProgressBarPart *part1 = progress_bar_create_part("step", total); + struct ProgressBarPart *part2 = progress_bar_create_part("fetch", total); + struct ProgressBarPart *part3 = progress_bar_create_part("commit", total); + + struct ProgressBar *bar = progress_bar_init("Compute"); + progress_bar_set_update_interval(bar, 0.5); + + progress_bar_bind_part(bar, part1); + progress_bar_bind_part(bar, part2); + progress_bar_bind_part(bar, part3); + + timestamp_t start_time = timestamp_get(); + for (uint64_t i = 0; i < total; i++) { + progress_bar_update_part(bar, part1, 1); + progress_bar_update_part(bar, part2, 1); + progress_bar_update_part(bar, part3, 1); + } + + progress_bar_finish(bar); + progress_bar_delete(bar); + + timestamp_t end_time = timestamp_get(); + printf("time taken: %" PRIu64 "\n", end_time - start_time); + + return 0; +} diff --git a/poncho/src/poncho/library_network_code.py b/poncho/src/poncho/library_network_code.py index db6127cc71..cbc88209ef 100755 --- a/poncho/src/poncho/library_network_code.py +++ b/poncho/src/poncho/library_network_code.py @@ -28,6 +28,9 @@ r, w = os.pipe() exec_method = None +# infile load mode for function tasks inside this library +function_infile_load_mode = None + # This class captures how results from FunctionCalls are conveyed from # the library to the manager. @@ -81,7 +84,22 @@ def remote_wrapper(event): # Handler to sigchld when child exits. 
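The handler below relies on the classic self-pipe pattern: the signal handler only performs a single async-signal-safe os.write and swallows OSError so a full pipe can never raise inside the handler, while the main loop blocks on the read end to learn that at least one child has exited. A standalone sketch of the same pattern, not part of this patch and with illustrative names:

    import os
    import signal

    r, w = os.pipe()

    def on_sigchld(signum, frame):
        try:
            os.write(w, b"a")   # one byte is enough; just wake the reader
        except OSError:
            pass                # e.g. pipe full; never raise inside a handler

    signal.signal(signal.SIGCHLD, on_sigchld)

    pid = os.fork()
    if pid == 0:
        os._exit(0)             # child exits immediately

    os.read(r, 1)               # blocks until the handler reports an exit
    print(os.waitpid(-1, 0))    # reap the child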
def sigchld_handler(signum, frame): # write any byte to signal that there's at least 1 child - os.writev(w, [b"a"]) + try: + os.write(w, b"a") + except OSError: + pass + + +# Load the infile for a function task inside this library +def load_function_infile(in_file_path): + if function_infile_load_mode == "cloudpickle": + with open(in_file_path, "rb") as f: + return cloudpickle.load(f) + elif function_infile_load_mode == "json": + with open(in_file_path, "r", encoding="utf-8") as f: + return json.load(f) + else: + raise ValueError(f"invalid infile load mode: {function_infile_load_mode}") # Read data from worker, start function, and dump result to `outfile`. @@ -130,8 +148,7 @@ def start_function(in_pipe_fd, thread_limit=1): os.chdir(function_sandbox) # parameters are represented as infile. - with open("infile", "rb") as f: - event = cloudpickle.load(f) + event = load_function_infile("infile") # output of execution should be dumped to outfile. result = globals()[function_name](event) @@ -150,24 +167,22 @@ def start_function(in_pipe_fd, thread_limit=1): raise except Exception as e: - stdout_timed_message( - f"Library code: Function call failed due to {e}", - file=sys.stderr, - ) + stdout_timed_message(f"Library code: Function call failed due to {e}") sys.exit(1) finally: os.chdir(library_sandbox) return -1, function_id else: try: - arg_infile = os.path.join(function_sandbox, "infile") - with open(arg_infile, "rb") as f: - event = cloudpickle.load(f) + infile_path = os.path.join(function_sandbox, "infile") + event = load_function_infile(infile_path) except Exception: - stdout_timed_message(f"TASK {function_id} error: can't load the arguments from {arg_infile}") - return + stdout_timed_message(f"TASK {function_id} error: can't load the arguments from {infile_path}") + return -1, function_id p = os.fork() if p == 0: + exit_status = 1 + try: # change the working directory to the function's sandbox os.chdir(function_sandbox) @@ -175,49 +190,33 @@ def start_function(in_pipe_fd, thread_limit=1): stdout_timed_message(f"TASK {function_id} {function_name} arrives, starting to run in process {os.getpid()}") try: - exit_status = 1 - except Exception: - stdout_timed_message(f"TASK {function_id} error: can't load the arguments from infile") - exit_status = 2 - raise - - try: - # setup stdout/err for a function call so we can capture them. - function_stdout_fd = os.open( - function_stdout_filename, os.O_WRONLY | os.O_CREAT | os.O_TRUNC - ) - # store the library's stdout fd - library_fd = os.dup(sys.stdout.fileno()) + # each child process independently redirects its own stdout/stderr. 
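+                    # Because this code runs in the forked child, rebinding fds 1 and 2
+                    # below only affects the child's descriptor table; the library
+                    # (parent) process keeps its own stdout untouched, so there is no
+                    # need to save and restore the parent's descriptors afterwards.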
+ with open(function_stdout_filename, "wb", buffering=0) as f: + os.dup2(f.fileno(), 1) # redirect stdout + os.dup2(f.fileno(), 2) # redirect stderr - # only redirect the stdout of a specific FunctionCall task into its own stdout fd, - # otherwise use the library's stdout - os.dup2(function_stdout_fd, sys.stdout.fileno()) - os.dup2(function_stdout_fd, sys.stderr.fileno()) - result = globals()[function_name](event) + stdout_timed_message(f"TASK {function_id} {function_name} starts in PID {os.getpid()}") + result = globals()[function_name](event) + stdout_timed_message(f"TASK {function_id} {function_name} finished") - # restore to the library's stdout fd on completion - os.dup2(library_fd, sys.stdout.fileno()) except Exception: - stdout_timed_message(f"TASK {function_id} error: can't execute this function") - exit_status = 3 + stdout_timed_message(f"TASK {function_id} error: can't execute {function_name} due to {traceback.format_exc()}") + exit_status = 2 raise - finally: - if function_stdout_fd in locals(): - os.close(function_stdout_fd) try: with open("outfile", "wb") as f: cloudpickle.dump(result, f) except Exception: stdout_timed_message(f"TASK {function_id} error: can't load the result from outfile") - exit_status = 4 - if os.path.exits("outfile"): + exit_status = 3 + if os.path.exists("outfile"): os.remove("outfile") raise try: if not result["Success"]: - exit_status = 5 + exit_status = 4 except Exception: stdout_timed_message(f"TASK {function_id} error: the result is invalid") exit_status = 5 @@ -232,14 +231,12 @@ def start_function(in_pipe_fd, thread_limit=1): os._exit(exit_status) elif p < 0: stdout_timed_message(f"TASK {function_id} error: unable to fork to execute {function_name}") - return -1 + return -1, function_id # return pid and function id of child process to parent. else: return p, function_id - return -1 - # Send result of a function execution to worker. Wake worker up to do work with SIGCHLD. def send_result(out_pipe_fd, worker_pid, task_id, exit_code): @@ -382,11 +379,16 @@ def main(): global exec_method exec_method = library_info['exec_mode'] + # set infile load mode of functions in this library + global function_infile_load_mode + function_infile_load_mode = library_info['function_infile_load_mode'] + # send configuration of library, just its name for now config = { "name": library_info['library_name'], "taskid": args.task_id, "exec_mode": exec_method, + "function_infile_load_mode": function_infile_load_mode, } send_configuration(config, out_pipe_fd, args.worker_pid) @@ -431,7 +433,15 @@ def main(): ) else: pid, func_id = start_function(in_pipe_fd, thread_limit) - pid_to_func_id[pid] = func_id + if pid == -1: + send_result( + out_pipe_fd, + args.worker_pid, + func_id, + 1, + ) + else: + pid_to_func_id[pid] = func_id else: # at least 1 child exits, reap all. # read only once as os.read is blocking if there's nothing to read. diff --git a/poncho/src/poncho/package_serverize.py b/poncho/src/poncho/package_serverize.py index 4a6e5e7a29..cfc789a11b 100755 --- a/poncho/src/poncho/package_serverize.py +++ b/poncho/src/poncho/package_serverize.py @@ -178,6 +178,7 @@ def pack_library_code(path, envpath): # @param exec_mode The execution mode of functions in this library. # @param hoisting_modules A list of modules imported at the preamble of library, including packages, functions and classes. # @param library_context_info A list containing [library_context_func, library_context_args, library_context_kwargs]. Used to create the library context on remote nodes. 
+# @param function_infile_load_mode The mode to load infile for function tasks inside this library. # @return A hash value. def generate_library_hash(library_name, function_list, @@ -186,7 +187,8 @@ def generate_library_hash(library_name, add_env, exec_mode, hoisting_modules, - library_context_info): + library_context_info, + function_infile_load_mode): library_info = [library_name] function_list = list(function_list) function_names = set() @@ -234,6 +236,8 @@ def generate_library_hash(library_name, for kwarg in library_context_info[2]: library_info.append(str(kwarg)) library_info.append(str(library_context_info[2][kwarg])) + + library_info.append(str(function_infile_load_mode)) library_info = ''.join(library_info) # linear time complexity msg = hashlib.sha1() @@ -293,6 +297,7 @@ def generate_taskvine_library_code(library_path, hoisting_modules=None): # @param exec_mode execution mode of functions in this library # @param hoisting_modules a list of modules to be imported at the preamble of library # @param library_context_info a list containing a library's context to be created remotely +# @param function_infile_load_mode The mode to load infile for function tasks inside this library. # @return name of the file containing serialized information about the library def generate_library(library_cache_path, library_code_path, @@ -303,7 +308,8 @@ def generate_library(library_cache_path, need_pack=True, exec_mode='fork', hoisting_modules=None, - library_context_info=None + library_context_info=None, + function_infile_load_mode='cloudpickle' ): # create library_info.clpk library_info = {} @@ -313,6 +319,7 @@ def generate_library(library_cache_path, library_info['library_name'] = library_name library_info['exec_mode'] = exec_mode library_info['context_info'] = cloudpickle.dumps(library_context_info) + library_info['function_infile_load_mode'] = function_infile_load_mode with open(library_info_path, 'wb') as f: cloudpickle.dump(library_info, f) diff --git a/taskvine/src/Makefile b/taskvine/src/Makefile index 8f828fd7bf..1bd3ef4f43 100644 --- a/taskvine/src/Makefile +++ b/taskvine/src/Makefile @@ -1,12 +1,13 @@ include ../../config.mk include ../../rules.mk -TARGETS=manager worker tools bindings examples +TARGETS=manager worker tools bindings examples graph all: $(TARGETS) worker: manager -bindings: manager +graph: manager +bindings: manager graph tools: manager examples: manager worker tools bindings diff --git a/taskvine/src/bindings/python3/Makefile b/taskvine/src/bindings/python3/Makefile index ca4ca6a52b..905b2703ca 100644 --- a/taskvine/src/bindings/python3/Makefile +++ b/taskvine/src/bindings/python3/Makefile @@ -34,4 +34,4 @@ install: all mkdir -p $(CCTOOLS_PYTHON3_PATH)/ndcctools/taskvine/compat cp ndcctools/taskvine/*.py $(DSPYTHONSO) $(CCTOOLS_PYTHON3_PATH)/ndcctools/taskvine cp ndcctools/taskvine/compat/*.py $(CCTOOLS_PYTHON3_PATH)/ndcctools/taskvine/compat - cp taskvine.py $(CCTOOLS_PYTHON3_PATH)/ + cp taskvine.py $(CCTOOLS_PYTHON3_PATH)/ \ No newline at end of file diff --git a/taskvine/src/bindings/python3/ndcctools/taskvine/manager.py b/taskvine/src/bindings/python3/ndcctools/taskvine/manager.py index b6767a6b56..55be0e835d 100644 --- a/taskvine/src/bindings/python3/ndcctools/taskvine/manager.py +++ b/taskvine/src/bindings/python3/ndcctools/taskvine/manager.py @@ -936,8 +936,9 @@ def check_library_exists(self, library_name): # @param hoisting_modules A list of modules imported at the preamble of library, including packages, functions and classes. 
# @param exec_mode Execution mode that the library should use to run function calls. Either 'direct' or 'fork' # @param library_context_info A list containing [library_context_func, library_context_args, library_context_kwargs]. Used to create the library context on remote nodes. + # @param function_infile_load_mode The mode to load infile for function tasks inside this library. # @returns A task to be used with @ref ndcctools.taskvine.manager.Manager.install_library. - def create_library_from_functions(self, library_name, *function_list, poncho_env=None, init_command=None, add_env=True, hoisting_modules=None, exec_mode='fork', library_context_info=None): + def create_library_from_functions(self, library_name, *function_list, poncho_env=None, init_command=None, add_env=True, hoisting_modules=None, exec_mode='fork', library_context_info=None, function_infile_load_mode='cloudpickle'): # Delay loading of poncho until here, to avoid bringing in poncho dependencies unless needed. # Ensure poncho python library is available. from ndcctools.poncho import package_serverize @@ -959,7 +960,8 @@ def create_library_from_functions(self, library_name, *function_list, poncho_env add_env=add_env, exec_mode=exec_mode, hoisting_modules=hoisting_modules, - library_context_info=library_context_info) + library_context_info=library_context_info, + function_infile_load_mode=function_infile_load_mode) # Create path for caching library code and environment based on function hash. library_cache_dir_name = "vine-library-cache" @@ -1007,7 +1009,8 @@ def create_library_from_functions(self, library_name, *function_list, poncho_env need_pack=need_pack, exec_mode=exec_mode, hoisting_modules=hoisting_modules, - library_context_info=library_context_info) + library_context_info=library_context_info, + function_infile_load_mode=function_infile_load_mode) # enable correct permissions for library code os.chmod(library_code_path, 0o775) diff --git a/taskvine/src/bindings/python3/taskvine.i b/taskvine/src/bindings/python3/taskvine.i index ba4e66cb74..1d875f97be 100644 --- a/taskvine/src/bindings/python3/taskvine.i +++ b/taskvine/src/bindings/python3/taskvine.i @@ -1,5 +1,5 @@ /* taskvine.i */ -%module cvine +%module(package="ndcctools.taskvine") cvine %include carrays.i %array_functions(struct rmsummary *, rmsummayArray); diff --git a/taskvine/src/graph/Makefile b/taskvine/src/graph/Makefile new file mode 100644 index 0000000000..f961ec9e86 --- /dev/null +++ b/taskvine/src/graph/Makefile @@ -0,0 +1,11 @@ +include ../../../config.mk +include ../../../rules.mk + +SUBDIRS = vinedag + +all clean install test lint format: $(SUBDIRS) + +$(SUBDIRS): %: + $(MAKE) -C $@ $(MAKECMDGOALS) + +.PHONY: all clean install test lint format $(SUBDIRS) diff --git a/taskvine/src/graph/vinedag/.gitignore b/taskvine/src/graph/vinedag/.gitignore new file mode 100644 index 0000000000..38280b8491 --- /dev/null +++ b/taskvine/src/graph/vinedag/.gitignore @@ -0,0 +1,6 @@ +*.a +*.so +*.o +*_wrap.c +*_wrap.0 +build/ \ No newline at end of file diff --git a/taskvine/src/graph/vinedag/.gitkeep b/taskvine/src/graph/vinedag/.gitkeep new file mode 100644 index 0000000000..e69de29bb2 diff --git a/taskvine/src/graph/vinedag/Makefile b/taskvine/src/graph/vinedag/Makefile new file mode 100644 index 0000000000..32dff04894 --- /dev/null +++ b/taskvine/src/graph/vinedag/Makefile @@ -0,0 +1,36 @@ +include ../../../../config.mk +include ../../../../rules.mk + +MODULE_DIR := $(CCTOOLS_PYTHON3_PATH)/ndcctools/taskvine/vinedag +SUBDIRS := context_graph vine_graph + +.PHONY: 
all install clean lint format $(SUBDIRS) + +all: $(SUBDIRS) + +$(SUBDIRS): + $(MAKE) -C $@ all + +install: all + mkdir -p $(MODULE_DIR) + cp vinedag.py $(MODULE_DIR) + cp __init__.py $(MODULE_DIR) + @for dir in $(SUBDIRS); do \ + $(MAKE) -C $$dir install; \ + done + +clean: + @for dir in $(SUBDIRS); do \ + $(MAKE) -C $$dir clean; \ + done + rm -rf build + +lint: + @for dir in $(SUBDIRS); do \ + $(MAKE) -C $$dir lint; \ + done + +format: + @for dir in $(SUBDIRS); do \ + $(MAKE) -C $$dir format; \ + done diff --git a/taskvine/src/graph/vinedag/__init__.py b/taskvine/src/graph/vinedag/__init__.py new file mode 100644 index 0000000000..0a3da3f715 --- /dev/null +++ b/taskvine/src/graph/vinedag/__init__.py @@ -0,0 +1,7 @@ +# Copyright (C) 2025- The University of Notre Dame +# This software is distributed under the GNU General Public License. +# See the file COPYING for details. + +from .vinedag import VineDAG + +__all__ = ["VineDAG"] diff --git a/taskvine/src/graph/vinedag/context_graph/Makefile b/taskvine/src/graph/vinedag/context_graph/Makefile new file mode 100644 index 0000000000..bdee79013e --- /dev/null +++ b/taskvine/src/graph/vinedag/context_graph/Makefile @@ -0,0 +1,31 @@ +include ../../../../../config.mk +include ../../../../../rules.mk + +PROJECT_NAME = vinedag + +SOURCE_DIR = $(CCTOOLS_HOME)/taskvine/src/graph/$(PROJECT_NAME)/context_graph +MODULE_ROOT = $(CCTOOLS_PYTHON3_PATH)/ndcctools/taskvine/$(PROJECT_NAME) +MODULE_DIR = $(MODULE_ROOT)/context_graph + +PY_SOURCES = $(wildcard $(SOURCE_DIR)/*.py) + +.PHONY: all install clean lint format + +all: + @true + +install: + mkdir -p $(MODULE_DIR) + cp $(PY_SOURCES) $(MODULE_DIR) + +clean: + rm -rf __pycache__ + +lint: + flake8 --ignore=$(CCTOOLS_FLAKE8_IGNORE_ERRORS) \ + --exclude=$(CCTOOLS_FLAKE8_IGNORE_FILES) \ + $(SOURCE_DIR)/ + +format: + @true + diff --git a/taskvine/src/graph/vinedag/context_graph/__init__.py b/taskvine/src/graph/vinedag/context_graph/__init__.py new file mode 100644 index 0000000000..7d8b678cc5 --- /dev/null +++ b/taskvine/src/graph/vinedag/context_graph/__init__.py @@ -0,0 +1,18 @@ +# Copyright (C) 2025 The University of Notre Dame +# This software is distributed under the GNU General Public License. +# See the file COPYING for details. + + +from .core import ContextGraph, ContextGraphTaskResult +from .proxy_functions import compute_single_key, compute_dts_key, compute_sexpr_key +from .proxy_library import ProxyLibrary + + +__all__ = [ + "ContextGraph", + "ContextGraphTaskResult", + "compute_single_key", + "compute_dts_key", + "compute_sexpr_key", + "ProxyLibrary", +] diff --git a/taskvine/src/graph/vinedag/context_graph/core.py b/taskvine/src/graph/vinedag/context_graph/core.py new file mode 100644 index 0000000000..61bbeb0d30 --- /dev/null +++ b/taskvine/src/graph/vinedag/context_graph/core.py @@ -0,0 +1,195 @@ +# Copyright (C) 2025 The University of Notre Dame +# This software is distributed under the GNU General Public License. +# See the file COPYING for details. + +import cloudpickle +import collections +import random +from collections import deque + +# Attempt to import Dask helpers. When they are unavailable we fall back to +# None so environments without Dask continue to work. If Dask is present—either +# the legacy graph or the newer TaskSpec API—we normalize it into our internal +# task representation. 
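For reference, a minimal sketch of the legacy s-expression form that the fallback path still handles: keys name tasks, a tuple whose first element is callable is a call, and bare keys appearing inside a value are dependencies. (The newer dask._task_spec nodes are handled natively via each node's `dependencies` attribute.) The names below are illustrative and not part of the patch:

    def add(x, y):
        return x + y

    task_dict = {
        "a": 1,                  # constant leaf value
        "b": 2,
        "c": (add, "a", "b"),    # depends on "a" and "b"
        "d": (add, "c", 10),     # depends on "c"
    }

    # ContextGraph(task_dict) derives parents_of["c"] == {"a", "b"} and
    # children_of["c"] == {"d"} from this structure, and compute_sexpr_key
    # later evaluates "c" by loading the saved results of "a" and "b".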
+try: + import dask +except ImportError: + dask = None +try: + import dask._task_spec as dts +except ImportError: + dts = None + + +def hashable(s): + """Used while wiring dependencies to spot values we can treat as node keys.""" + try: + hash(s) + return True + except TypeError: + return False + + +# Lightweight wrapper around task results that optionally pads the payload. The +# padding lets tests model large outputs without altering the logical result. +class ContextGraphTaskResult: + def __init__(self, result, extra_size_mb=None): + """Store the real user result plus optional padding used during regression tests.""" + self.result = result + self.extra_obj = bytearray(int(extra_size_mb * 1024 * 1024)) if extra_size_mb and extra_size_mb > 0 else None + + @staticmethod + def load_from_path(path): + """Workers call this while recovering an output produced by save_result_of_key from disk. + If a node-local output, then data is stored in the task sandbox and the path is just the filename + If a shared file system output, then path is the full path to the file + If a target result, the path is the full path to the file in the manager's output directory""" + try: + with open(path, "rb") as f: + result_obj = cloudpickle.load(f) + assert isinstance(result_obj, ContextGraphTaskResult), "Loaded object is not of type ContextGraphTaskResult" + return result_obj.result + except FileNotFoundError: + raise FileNotFoundError(f"Output file not found at {path}") + + +# ContextGraph builds the logical DAG and manages dependencies. The +# object is cloudpickled, shipped with the proxy library, and hoisted on worker +# nodes. When a task key executes we map from the Vine key back to the original +# graph key, run the user function, and persist the result. +class ContextGraph: + def __init__(self, task_dict, + extra_task_output_size_mb=[0, 0], + extra_task_sleep_time=[0, 0]): + """Capture the Python DAG that VineDAG hands us before we mirror it in C.""" + self.task_dict = task_dict + + if dts: + for k, v in self.task_dict.items(): + if isinstance(v, dts.GraphNode): + assert isinstance(v, (dts.Alias, dts.Task, dts.DataNode)), f"Unsupported task type for key {k}: {v.__class__}" + + self.parents_of, self.children_of = self._build_dependencies(self.task_dict) + + # these mappings are set after node ids are assigned in the C vine graph + self.ckey2vid = {} + self.vid2ckey = {} + + # will be set from vine graph + self.outfile_remote_name = {key: None for key in self.task_dict.keys()} + + # testing params + self.extra_task_output_size_mb = self._calculate_extra_size_mb_of(extra_task_output_size_mb) + self.extra_sleep_time_of = self._calculate_extra_sleep_time_of(extra_task_sleep_time) + + def _calculate_extra_size_mb_of(self, extra_task_output_size_mb): + """Sample a uniform byte budget between low/high for every node.""" + assert isinstance(extra_task_output_size_mb, list) and len(extra_task_output_size_mb) == 2 + low, high = extra_task_output_size_mb + low, high = int(low), int(high) + assert low <= high + + return {k: random.uniform(low, high) for k in self.task_dict.keys()} + + def _calculate_extra_sleep_time_of(self, extra_task_sleep_time): + """Pick a uniform delay between low/high so tests can fake runtime.""" + assert isinstance(extra_task_sleep_time, list) and len(extra_task_sleep_time) == 2 + low, high = extra_task_sleep_time + low, high = int(low), int(high) + assert low <= high + + return {k: random.uniform(low, high) for k in self.task_dict.keys()} + + def is_dts_key(self, k): + """Gate the 
Dask-specific branch when we parse task definitions.""" + if not hasattr(dask, "_task_spec"): + return False + import dask._task_spec as dts + return isinstance(self.task_dict[k], (dts.Task, dts.TaskRef, dts.Alias, dts.DataNode, dts.NestedContainer)) + + def _build_dependencies(self, task_dict): + """Normalize mixed Dask/s-expression inputs into our parent/child lookup tables.""" + def _find_sexpr_parents(sexpr): + """Resolve the immediate parents inside one symbolic expression node.""" + if hashable(sexpr) and sexpr in task_dict.keys(): + return {sexpr} + elif isinstance(sexpr, (list, tuple)): + deps = set() + for x in sexpr: + deps |= _find_sexpr_parents(x) + return deps + elif isinstance(sexpr, dict): + deps = set() + for k, v in sexpr.items(): + deps |= _find_sexpr_parents(k) + deps |= _find_sexpr_parents(v) + return deps + else: + return set() + + parents_of = collections.defaultdict(set) + children_of = collections.defaultdict(set) + + for k, value in task_dict.items(): + if self.is_dts_key(k): + # in the new Dask expression, each value is an object from dask._task_spec, could be + # a Task, Alias, TaskRef, etc., but they all share the same base class the dependencies + # field is of type frozenset(), without recursive ancestor dependencies involved + parents_of[k] = value.dependencies + else: + # the value could be a sexpr, e.g., the old Dask representation + parents_of[k] = _find_sexpr_parents(value) + + for k, deps in parents_of.items(): + for dep in deps: + children_of[dep].add(k) + + return parents_of, children_of + + def save_result_of_key(self, key, result): + """Called from the proxy function to persist a result into disk after the worker finishes.""" + with open(self.outfile_remote_name[key], "wb") as f: + result_obj = ContextGraphTaskResult(result, extra_size_mb=self.extra_task_output_size_mb[key]) + cloudpickle.dump(result_obj, f) + + def load_result_of_key(self, key): + """Used by downstream tasks to pull inputs from disk or the shared store.""" + # workers user this function to load results from either local or shared file system + # if a node-local output, then data is stored in the task sandbox and the remote name is just the filename + # if a shared file system output, then remote name is the full path to the file + outfile_path = self.outfile_remote_name[key] + return ContextGraphTaskResult.load_from_path(outfile_path) + + def get_topological_order(self): + """Produce the order VineDAG uses when assigning node IDs to the C graph.""" + in_degree = {key: len(self.parents_of[key]) for key in self.task_dict.keys()} + queue = deque([key for key, degree in in_degree.items() if degree == 0]) + topo_order = [] + + while queue: + current = queue.popleft() + topo_order.append(current) + + for child in self.children_of[current]: + in_degree[child] -= 1 + if in_degree[child] == 0: + queue.append(child) + + if len(topo_order) != len(self.task_dict): + print(f"len(topo_order): {len(topo_order)}") + print(f"len(self.task_dict): {len(self.task_dict)}") + raise ValueError("Failed to create topo order, the dependencies may be cyclic or problematic") + + return topo_order + + @staticmethod + def context_loader_func(context_graph_pkl): + """Entry point the proxy library invokes to restore the serialized ContextGraph.""" + context_graph = cloudpickle.loads(context_graph_pkl) + + if not isinstance(context_graph, ContextGraph): + raise TypeError("context_graph_pkl is not of type ContextGraph") + + return { + "context_graph": context_graph, + } diff --git 
a/taskvine/src/graph/vinedag/context_graph/proxy_functions.py b/taskvine/src/graph/vinedag/context_graph/proxy_functions.py new file mode 100644 index 0000000000..4218756007 --- /dev/null +++ b/taskvine/src/graph/vinedag/context_graph/proxy_functions.py @@ -0,0 +1,111 @@ +# Copyright (C) 2025- The University of Notre Dame +# This software is distributed under the GNU General Public License. +# See the file COPYING for details. + +import os +import time +from ndcctools.taskvine.utils import load_variable_from_library + + +def compute_dts_key(context_graph, k, v): + """ + Compute the result of a Dask task node from dask._task_spec. + + Each value `v` may be an instance of Task, Alias, or DataNode, all of which + inherit from the same base class. The `dependencies` field is a frozenset + containing direct dependencies only (no recursive ancestry). + + The function resolves each dependency from the context_graph, constructs an + input dictionary, and then executes the node according to its type. + """ + try: + import dask._task_spec as dts + except ImportError: + raise ImportError("Dask is not installed") + + input_dict = {dep: context_graph.load_result_of_key(dep) for dep in v.dependencies} + + try: + if isinstance(v, dts.Alias): + assert len(v.dependencies) == 1, "Expected exactly one dependency" + return context_graph.load_result_of_key(next(iter(v.dependencies))) + elif isinstance(v, dts.Task): + return v(input_dict) + elif isinstance(v, dts.DataNode): + return v.value + else: + raise TypeError(f"unexpected node type: {type(v)} for key {k}") + except Exception as e: + raise Exception(f"Error while executing task {k}: {e}") + + +def compute_sexpr_key(context_graph, k, v): + """ + Evaluate a symbolic expression (S-expression) task within the task graph. + + Both VineDAG and legacy Dask represent computations as symbolic + expression trees (S-expressions). Each task value `v` encodes a nested + structure where: + - Leaf nodes are constants or task keys referencing parent results. + - Lists are recursively evaluated. + - Tuples of the form (func, arg1, arg2, ...) represent function calls. + + This function builds an input dictionary from all parent keys, then + recursively resolves and executes the expression until a final value + is produced. + """ + input_dict = {parent: context_graph.load_result_of_key(parent) for parent in context_graph.parents_of[k]} + + def _rec_call(expr): + try: + if expr in input_dict.keys(): + return input_dict[expr] + except TypeError: + pass + if isinstance(expr, list): + return [_rec_call(e) for e in expr] + if isinstance(expr, tuple) and len(expr) > 0 and callable(expr[0]): + res = expr[0](*[_rec_call(a) for a in expr[1:]]) + return res + return expr + + try: + return _rec_call(v) + except Exception as e: + raise Exception(f"Failed to invoke _rec_call(): {e}") + + +def compute_single_key(vine_key): + """ + Compute a single task identified by a Vine key within the current ContextGraph. + + The function retrieves the corresponding graph key and task object from the + global context_graph, determines the task type, and dispatches to the appropriate + execution interface — e.g., `compute_dts_key` for Dask-style task specs or + `compute_sexpr_key` for S-expression graphs. + + This design allows extensibility: for new graph representations, additional + compute interfaces can be introduced and registered here to handle new key types. 
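+
+    For example, if `vid2ckey` maps Vine key 7 to graph key "c" and
+    `task_dict["c"]` holds the s-expression `(add, "a", "b")`, then
+    `compute_single_key(7)` loads the stored results of "a" and "b",
+    applies `add`, and writes the new result to `outfile_remote_name["c"]`
+    (the keys and `add` here are illustrative).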
+ + After computation, the result is saved, the output file is validated, and + an optional delay (`extra_sleep_time_of`) is applied before returning. + """ + context_graph = load_variable_from_library('context_graph') + + k = context_graph.vid2ckey[vine_key] + v = context_graph.task_dict[k] + + if context_graph.is_dts_key(k): + result = compute_dts_key(context_graph, k, v) + else: + result = compute_sexpr_key(context_graph, k, v) + + context_graph.save_result_of_key(k, result) + if not os.path.exists(context_graph.outfile_remote_name[k]): + raise Exception(f"Output file {context_graph.outfile_remote_name[k]} does not exist after writing") + if os.stat(context_graph.outfile_remote_name[k]).st_size == 0: + raise Exception(f"Output file {context_graph.outfile_remote_name[k]} is empty after writing") + + time.sleep(context_graph.extra_sleep_time_of[k]) + + return True diff --git a/taskvine/src/graph/vinedag/context_graph/proxy_library.py b/taskvine/src/graph/vinedag/context_graph/proxy_library.py new file mode 100644 index 0000000000..a0c4fb4377 --- /dev/null +++ b/taskvine/src/graph/vinedag/context_graph/proxy_library.py @@ -0,0 +1,94 @@ +# Copyright (C) 2025- The University of Notre Dame +# This software is distributed under the GNU General Public License. +# See the file COPYING for details. + +import os +import uuid +import cloudpickle +import types +import time +import random +import hashlib +import collections + +from ndcctools.taskvine.vinedag.context_graph.core import ContextGraphTaskResult, ContextGraph +from ndcctools.taskvine.vinedag.context_graph.proxy_functions import compute_dts_key, compute_sexpr_key, compute_single_key +from ndcctools.taskvine.utils import load_variable_from_library + + +class ProxyLibrary: + def __init__(self, py_manager): + self.py_manager = py_manager + + self.name = None + self.libcores = None + + self.libtask = None + + # these modules are always included in the preamble of the library task, so that function calls can execute directly + # using the loaded context without importing them over and over again + self.hoisting_modules = [ + os, cloudpickle, ContextGraphTaskResult, ContextGraph, uuid, hashlib, random, types, collections, time, + load_variable_from_library, compute_dts_key, compute_sexpr_key, compute_single_key + ] + + # environment files serve as additional inputs to the library task, where each key is the local path and the value is the remote path + # those local files will be sent remotely to the workers so tasks can access them as appropriate + self.env_files = {} + + # context loader is a function that will be used to load the library context on remote nodes. 
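+        # For example, a caller might wire the loader like this (illustrative names,
+        # assuming `manager` is a taskvine Manager and `task_dict` a graph definition):
+        #     graph = ContextGraph(task_dict)
+        #     proxy = ProxyLibrary(manager)
+        #     proxy.set_name("vinedag-lib")
+        #     proxy.set_libcores(4)
+        #     proxy.set_context_loader(ContextGraph.context_loader_func,
+        #                              [cloudpickle.dumps(graph)], {})
+        #     proxy.install()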
+ self.context_loader_func = None + self.context_loader_args = [] + self.context_loader_kwargs = {} + + self.local_path = None + self.remote_path = None + + def set_libcores(self, libcores): + self.libcores = libcores + + def set_name(self, name): + self.name = name + + def add_hoisting_modules(self, new_modules): + assert isinstance(new_modules, list), "new_modules must be a list of modules" + self.hoisting_modules.extend(new_modules) + + def add_env_files(self, new_env_files): + assert isinstance(new_env_files, dict), "new_env_files must be a dictionary" + self.env_files.update(new_env_files) + + def set_context_loader(self, context_loader_func, context_loader_args=[], context_loader_kwargs={}): + self.context_loader_func = context_loader_func + self.context_loader_args = context_loader_args + self.context_loader_kwargs = context_loader_kwargs + + def get_context_size(self): + dumped_data = self.context_loader_args[0] + serialized = round(len(dumped_data) / 1024 / 1024, 2) + return serialized + + def install(self): + assert self.name is not None, "Library name must be set before installing (use set_name method)" + assert self.libcores is not None, "Library cores must be set before installing (use set_libcores method)" + + self.libtask = self.py_manager.create_library_from_functions( + self.name, + compute_single_key, + library_context_info=[self.context_loader_func, self.context_loader_args, self.context_loader_kwargs], + add_env=False, + function_infile_load_mode="json", + hoisting_modules=self.hoisting_modules, + ) + for local, remote in self.env_files.items(): + # check if the local file exists + if not os.path.exists(local): + raise FileNotFoundError(f"Local file {local} not found") + # attach as the input file to the library task + self.libtask.add_input(self.py_manager.declare_file(local, cache=True, peer_transfer=True), remote) + self.libtask.set_cores(self.libcores) + self.libtask.set_function_slots(self.libcores) + self.py_manager.install_library(self.libtask) + + def uninstall(self): + self.py_manager.remove_library(self.name) diff --git a/taskvine/src/graph/vinedag/vine_graph/.gitignore b/taskvine/src/graph/vinedag/vine_graph/.gitignore new file mode 100644 index 0000000000..15309787ad --- /dev/null +++ b/taskvine/src/graph/vinedag/vine_graph/.gitignore @@ -0,0 +1 @@ +*.o \ No newline at end of file diff --git a/taskvine/src/graph/vinedag/vine_graph/Makefile b/taskvine/src/graph/vinedag/vine_graph/Makefile new file mode 100644 index 0000000000..45fc3f7b42 --- /dev/null +++ b/taskvine/src/graph/vinedag/vine_graph/Makefile @@ -0,0 +1,90 @@ +include ../../../../../config.mk +include ../../../../../rules.mk + +PROJECT_NAME = vinedag + +LOCAL_LINKAGE+=${CCTOOLS_HOME}/taskvine/src/manager/libtaskvine.a ${CCTOOLS_HOME}/dttools/src/libdttools.a +LOCAL_CCFLAGS+=-I ${CCTOOLS_HOME}/taskvine/src/manager +LOCAL_CCFLAGS+=-I ${CCTOOLS_HOME}/taskvine/src/graph/$(PROJECT_NAME)/vine_graph + +SOURCE_DIR = $(CCTOOLS_HOME)/taskvine/src/graph/$(PROJECT_NAME)/vine_graph +MODULE_ROOT = $(CCTOOLS_PYTHON3_PATH)/ndcctools/taskvine/$(PROJECT_NAME) +MODULE_DIR = $(MODULE_ROOT)/vine_graph + +SOURCES = vine_node.c vine_graph.c +OBJECTS = $(SOURCES:%.c=%.o) + +BUILD_DIR := ../build + +# put SWIG generated sources and Python extension artifacts into ../build/ +SWIG_I = vine_graph.i + +WRAP_NAME = vine_graph_wrap +MODULE_NAME = vine_graph_capi + +SWIG_WRAP = $(BUILD_DIR)/$(WRAP_NAME).c +WRAP_OBJ = $(BUILD_DIR)/$(WRAP_NAME).o +PYMODULE = $(BUILD_DIR)/_$(MODULE_NAME).$(CCTOOLS_DYNAMIC_SUFFIX) + +LIBRARIES = 
+PYDEPS = $(WRAP_OBJ) $(OBJECTS) +PYLINK_INPUT = $(WRAP_OBJ) $(OBJECTS) +PROGRAMS = +SCRIPTS = +TARGETS = $(LIBRARIES) $(PYMODULE) $(PROGRAMS) + +.PHONY: all install clean lint format + +all: $(TARGETS) + +$(PROGRAMS): $(EXTERNALS) + +$(BUILD_DIR): + mkdir -p $(BUILD_DIR) + +$(SWIG_WRAP): $(SWIG_I) vine_graph.h | $(BUILD_DIR) + $(CCTOOLS_SWIG) -python -threads -relativeimport \ + -I$(CCTOOLS_HOME)/taskvine/src/manager \ + -I$(CCTOOLS_HOME)/dttools/src \ + -I$(CCTOOLS_HOME)/taskvine/src/graph/$(PROJECT_NAME) \ + -I$(CCTOOLS_HOME)/taskvine/src/graph/$(PROJECT_NAME)/vine_graph \ + -outdir $(BUILD_DIR) -o $@ $< + +# Build Python module (mimic bindings: silence SWIG warnings and build PIC) +$(WRAP_OBJ): $(SWIG_WRAP) + $(CCTOOLS_CC) -o $@ -c $(CCTOOLS_INTERNAL_CCFLAGS) $(LOCAL_CCFLAGS) $(CCTOOLS_PYTHON3_CCFLAGS) -w -fPIC -DNDEBUG $< + +$(PYMODULE): $(PYDEPS) +ifeq ($(CCTOOLS_STATIC),1) + $(CCTOOLS_LD) -o $@ $(CCTOOLS_DYNAMIC_FLAG) $(CCTOOLS_INTERNAL_LDFLAGS) $(LOCAL_LDFLAGS) $(PYLINK_INPUT) $(LOCAL_LINKAGE) $(CCTOOLS_PYTHON3_LDFLAGS) $(CCTOOLS_EXTERNAL_LINKAGE) +else + $(CCTOOLS_LD) -o $@ $(CCTOOLS_DYNAMIC_FLAG) $(CCTOOLS_INTERNAL_LDFLAGS) $(LOCAL_LDFLAGS) $(PYLINK_INPUT) $(LOCAL_LINKAGE) $(CCTOOLS_PYTHON3_LDFLAGS) $(CCTOOLS_EXTERNAL_LINKAGE) +endif + +install: all + mkdir -p $(CCTOOLS_INSTALL_DIR)/graph/$(PROJECT_NAME)/include + cp ${CCTOOLS_HOME}/taskvine/src/manager/taskvine.h $(CCTOOLS_INSTALL_DIR)/graph/$(PROJECT_NAME)/include/ + mkdir -p $(MODULE_DIR) + cp $(PYMODULE) $(MODULE_DIR) + cp $(BUILD_DIR)/$(MODULE_NAME).py $(MODULE_DIR) + cp $(SOURCE_DIR)/__init__.py $(MODULE_DIR) + cp $(SOURCE_DIR)/vine_graph_client.py $(MODULE_DIR) + +clean: + rm -f $(PROGRAMS) $(OBJECTS) $(WRAP_OBJ) + rm -f $(PYMODULE) $(BUILD_DIR)/$(MODULE_NAME).py + rm -rf $(BUILD_DIR) + +lint: + if ( ! clang-format -Werror --dry-run --style='file:${CCTOOLS_HOME}/.clang-format' $(SOURCE_DIR)/*.c $(SOURCE_DIR)/*.h); \ + then \ + echo "========================================================"; \ + echo "NOTICE: Run \`make format\` to format your latest changes."; \ + echo "========================================================"; \ + exit 1; \ + fi + +format: + clang-format -i $(SOURCE_DIR)/*.c $(SOURCE_DIR)/*.h + + diff --git a/taskvine/src/graph/vinedag/vine_graph/__init__.py b/taskvine/src/graph/vinedag/vine_graph/__init__.py new file mode 100644 index 0000000000..536eed6b88 --- /dev/null +++ b/taskvine/src/graph/vinedag/vine_graph/__init__.py @@ -0,0 +1,10 @@ +# Copyright (C) 2025 The University of Notre Dame +# This software is distributed under the GNU General Public License. +# See the file COPYING for details. + + +from . 
import vine_graph_capi +from .vine_graph_client import VineGraphClient + + +__all__ = ["vine_graph_capi", "VineGraphClient"] diff --git a/taskvine/src/graph/vinedag/vine_graph/vine_graph.c b/taskvine/src/graph/vinedag/vine_graph/vine_graph.c new file mode 100644 index 0000000000..3ef646c768 --- /dev/null +++ b/taskvine/src/graph/vinedag/vine_graph/vine_graph.c @@ -0,0 +1,1369 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "priority_queue.h" +#include "list.h" +#include "debug.h" +#include "itable.h" +#include "xxmalloc.h" +#include "stringtools.h" +#include "random.h" +#include "hash_table.h" +#include "set.h" +#include "timestamp.h" +#include "progress_bar.h" +#include "macros.h" +#include "uuid.h" + +#include "vine_node.h" +#include "vine_graph.h" +#include "vine_manager.h" +#include "vine_worker_info.h" +#include "vine_task.h" +#include "vine_file.h" +#include "vine_mount.h" +#include "taskvine.h" +#include "vine_temp.h" + +static volatile sig_atomic_t interrupted = 0; + +/*************************************************************/ +/* Private Functions */ +/*************************************************************/ + +/** + * Handle the SIGINT signal. + * @param signal Reference to the signal. + */ +static void handle_sigint(int signal) +{ + interrupted = 1; +} + +/** + * Calculate the priority of a node given the priority mode. + * @param node Reference to the node object. + * @param priority_mode Reference to the priority mode. + * @return The priority. + */ +static double calculate_task_priority(struct vine_node *node, task_priority_mode_t priority_mode) +{ + if (!node) { + return 0; + } + + double priority = 0; + timestamp_t current_time = timestamp_get(); + + struct vine_node *parent_node; + + switch (priority_mode) { + case TASK_PRIORITY_MODE_RANDOM: + priority = random_double(); + break; + case TASK_PRIORITY_MODE_DEPTH_FIRST: + priority = (double)node->depth; + break; + case TASK_PRIORITY_MODE_BREADTH_FIRST: + priority = -(double)node->depth; + break; + case TASK_PRIORITY_MODE_FIFO: + priority = -(double)current_time; + break; + case TASK_PRIORITY_MODE_LIFO: + priority = (double)current_time; + break; + case TASK_PRIORITY_MODE_LARGEST_INPUT_FIRST: + LIST_ITERATE(node->parents, parent_node) + { + if (!parent_node->outfile) { + continue; + } + priority += (double)vine_file_size(parent_node->outfile); + } + break; + case TASK_PRIORITY_MODE_LARGEST_STORAGE_FOOTPRINT_FIRST: + LIST_ITERATE(node->parents, parent_node) + { + if (!parent_node->outfile) { + continue; + } + timestamp_t parent_task_completion_time = parent_node->task->time_workers_execute_last; + priority += (double)vine_file_size(parent_node->outfile) * (double)parent_task_completion_time; + } + break; + } + + return priority; +} + +/** + * Submit a node to the TaskVine manager via the vine graph. + * @param vg Reference to the vine graph. + * @param node Reference to the node. 
+ */ +static void submit_node_task(struct vine_graph *vg, struct vine_node *node) +{ + if (!vg || !node) { + return; + } + + /* calculate the priority of the node */ + double priority = calculate_task_priority(node, vg->task_priority_mode); + vine_task_set_priority(node->task, priority); + + /* submit the task to the manager */ + timestamp_t time_start = timestamp_get(); + int task_id = vine_submit(vg->manager, node->task); + node->submission_time = timestamp_get() - time_start; + + /* insert the task id to the task id to node map */ + itable_insert(vg->task_id_to_node, (uint64_t)task_id, node); + + debug(D_VINE, "submitted node %" PRIu64 " with task id %d", node->node_id, task_id); + + return; +} + +/** + * Submit the children of a node once every dependency has completed. + * @param vg Reference to the vine graph. + * @param node Reference to the node. + */ +static void submit_unblocked_children(struct vine_graph *vg, struct vine_node *node) +{ + if (!vg || !node) { + return; + } + + struct vine_node *child_node; + LIST_ITERATE(node->children, child_node) + { + /* Remove this parent from the child's pending set if it exists */ + if (child_node->pending_parents) { + /* Assert that this parent is indeed pending for the child */ + if (child_node->pending_parents && set_lookup(child_node->pending_parents, node)) { + set_remove(child_node->pending_parents, node); + } else { + debug(D_ERROR, "inconsistent pending set: child=%" PRIu64 " missing parent=%" PRIu64, child_node->node_id, node->node_id); + } + } + + /* If no more parents are pending, submit the child */ + if (!child_node->pending_parents || set_size(child_node->pending_parents) == 0) { + submit_node_task(vg, child_node); + } + } + + return; +} + +/** + * Compute a topological ordering of the vine graph. + * Call only after all nodes, edges, and metrics have been populated. + * @param vg Reference to the vine graph. + * @return Nodes in topological order. + */ +static struct list *get_topological_order(struct vine_graph *vg) +{ + if (!vg) { + return NULL; + } + + int total_nodes = itable_size(vg->nodes); + struct list *topo_order = list_create(); + struct itable *in_degree_map = itable_create(0); + struct priority_queue *pq = priority_queue_create(total_nodes); + + uint64_t nid; + struct vine_node *node; + ITABLE_ITERATE(vg->nodes, nid, node) + { + int deg = list_size(node->parents); + itable_insert(in_degree_map, nid, (void *)(intptr_t)deg); + if (deg == 0) { + priority_queue_push(pq, node, -(double)node->node_id); + } + } + + while (priority_queue_size(pq) > 0) { + struct vine_node *current = priority_queue_pop(pq); + list_push_tail(topo_order, current); + + struct vine_node *child; + LIST_ITERATE(current->children, child) + { + intptr_t raw_deg = (intptr_t)itable_lookup(in_degree_map, child->node_id); + int deg = (int)raw_deg - 1; + itable_insert(in_degree_map, child->node_id, (void *)(intptr_t)deg); + + if (deg == 0) { + priority_queue_push(pq, child, -(double)child->node_id); + } + } + } + + if (list_size(topo_order) != total_nodes) { + debug(D_ERROR, "Error: vine graph contains cycles or is malformed."); + debug(D_ERROR, "Expected %d nodes, but only sorted %d.", total_nodes, list_size(topo_order)); + + uint64_t id; + ITABLE_ITERATE(vg->nodes, id, node) + { + intptr_t raw_deg = (intptr_t)itable_lookup(in_degree_map, id); + int deg = (int)raw_deg; + if (deg > 0) { + debug(D_ERROR, " Node %" PRIu64 " has in-degree %d. 
Parents:", id, deg); + struct vine_node *p; + LIST_ITERATE(node->parents, p) + { + debug(D_ERROR, " -> %" PRIu64, p->node_id); + } + } + } + + list_delete(topo_order); + itable_delete(in_degree_map); + priority_queue_delete(pq); + exit(1); + } + + itable_delete(in_degree_map); + priority_queue_delete(pq); + return topo_order; +} + +/** + * Extract weakly connected components of the vine graph. + * Currently used for debugging and instrumentation only. + * @param vg Reference to the vine graph. + * @return List of weakly connected components. + */ +static struct list *extract_weakly_connected_components(struct vine_graph *vg) +{ + if (!vg) { + return NULL; + } + + struct set *visited = set_create(0); + struct list *components = list_create(); + + uint64_t nid; + struct vine_node *node; + ITABLE_ITERATE(vg->nodes, nid, node) + { + if (set_lookup(visited, node)) { + continue; + } + + struct list *component = list_create(); + struct list *queue = list_create(); + + list_push_tail(queue, node); + set_insert(visited, node); + list_push_tail(component, node); + + while (list_size(queue) > 0) { + struct vine_node *curr = list_pop_head(queue); + + struct vine_node *p; + LIST_ITERATE(curr->parents, p) + { + if (!set_lookup(visited, p)) { + list_push_tail(queue, p); + set_insert(visited, p); + list_push_tail(component, p); + } + } + + struct vine_node *c; + LIST_ITERATE(curr->children, c) + { + if (!set_lookup(visited, c)) { + list_push_tail(queue, c); + set_insert(visited, c); + list_push_tail(component, c); + } + } + } + + list_push_tail(components, component); + list_delete(queue); + } + + set_delete(visited); + return components; +} + +/** + * Compute the heavy score of a node in the vine graph. + * @param node Reference to the node. + * @return Heavy score. + */ +static double compute_node_heavy_score(struct vine_node *node) +{ + if (!node) { + return 0; + } + + double up_score = node->depth * node->upstream_subgraph_size * node->fan_in; + double down_score = node->height * node->downstream_subgraph_size * node->fan_out; + + return up_score / (down_score + 1); +} + +/** + * Map a TaskVine task back to its vine node. + * @param vg Reference to the vine graph. + * @param task Task reported by the manager. + * @return Matching node. + */ +static struct vine_node *get_node_by_task(struct vine_graph *vg, struct vine_task *task) +{ + if (!vg || !task) { + return NULL; + } + + if (task->type == VINE_TASK_TYPE_STANDARD) { + /* standard tasks are mapped directly to a node */ + return itable_lookup(vg->task_id_to_node, (uint64_t)task->task_id); + } else if (task->type == VINE_TASK_TYPE_RECOVERY) { + /* note that recovery tasks are not mapped to any node but we still need the original node for pruning, + * so we look up the outfile of the task, then map it back to get the original node */ + struct vine_mount *mount; + LIST_ITERATE(task->output_mounts, mount) + { + uint64_t original_producer_task_id = mount->file->original_producer_task_id; + if (original_producer_task_id > 0) { + return itable_lookup(vg->task_id_to_node, original_producer_task_id); + } + } + } + + debug(D_ERROR, "task %d has no original producer task id", task->task_id); + + return NULL; +} + +/** + * Prune the ancestors of a persisted node. This is only used for persisted nodes that produce persisted files. + * All ancestors we consider here include both temp nodes and persisted nodes, because data written to the shared file system + * is safe and can definitely trigger upstream data redundancy to be released. 
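+ * For example, once a node that writes its output to the shared file system completes,
+ * temp outputs held by its safe ancestors can be released, because the persisted file
+ * can be re-read rather than recomputed.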
+ * @param vg Reference to the vine graph. + * @param node Reference to the node object. + * @return The number of pruned replicas. + */ +static int prune_ancestors_of_persisted_node(struct vine_graph *vg, struct vine_node *node) +{ + if (!vg || !node) { + return -1; + } + + /* find all safe ancestors */ + struct set *safe_ancestors = vine_node_find_safe_ancestors(node); + if (!safe_ancestors) { + return 0; + } + + int pruned_replica_count = 0; + + timestamp_t start_time = timestamp_get(); + + /* prune all safe ancestors */ + struct vine_node *ancestor_node; + SET_ITERATE(safe_ancestors, ancestor_node) + { + switch (ancestor_node->outfile_type) { + case NODE_OUTFILE_TYPE_LOCAL: + /* do not prune the local file */ + break; + case NODE_OUTFILE_TYPE_TEMP: + /* prune the temp file */ + vine_prune_file(vg->manager, ancestor_node->outfile); + break; + case NODE_OUTFILE_TYPE_SHARED_FILE_SYSTEM: + /* unlink directly from the shared file system */ + unlink(ancestor_node->outfile_remote_name); + break; + } + ancestor_node->prune_status = PRUNE_STATUS_SAFE; + pruned_replica_count++; + } + + set_delete(safe_ancestors); + + node->time_spent_on_prune_ancestors_of_persisted_node += timestamp_get() - start_time; + + return pruned_replica_count; +} + +/** + * Prune the ancestors of a temp node. + * This function opportunistically releases upstream temporary files + * that are no longer needed once this temp-producing node has completed. + * + * Only ancestors producing temporary outputs are considered here. + * Files stored in the shared filesystem are never pruned by this function, + * because temp outputs are not considered sufficiently safe to trigger + * deletion of persisted data upstream. + * @param vg Reference to the vine graph. + * @param node Reference to the node object. + * @return The number of pruned replicas. + */ +static int prune_ancestors_of_temp_node(struct vine_graph *vg, struct vine_node *node) +{ + if (!vg || !node || !node->outfile || node->prune_depth <= 0) { + return 0; + } + + timestamp_t start_time = timestamp_get(); + + int pruned_replica_count = 0; + + struct list *parents = vine_node_find_parents_by_depth(node, node->prune_depth); + + struct vine_node *parent_node; + LIST_ITERATE(parents, parent_node) + { + /* skip if the parent does not produce a temp file */ + if (parent_node->outfile_type != NODE_OUTFILE_TYPE_TEMP) { + continue; + } + + /* a file is prunable if its outfile is no longer needed by any child node: + * 1. it has no pending dependents + * 2. 
all completed dependents have also completed their corresponding recovery tasks, if any */ + int all_children_completed = 1; + struct vine_node *child_node; + LIST_ITERATE(parent_node->children, child_node) + { + /* break early if the child node is not completed */ + if (!child_node->completed) { + all_children_completed = 0; + break; + } + /* if the task produces a temp file and the recovery task is running, the parent is not prunable */ + if (child_node->outfile && child_node->outfile->type == VINE_TEMP) { + struct vine_task *child_node_recovery_task = child_node->outfile->recovery_task; + if (child_node_recovery_task && (child_node_recovery_task->state != VINE_TASK_INITIAL && child_node_recovery_task->state != VINE_TASK_DONE)) { + all_children_completed = 0; + break; + } + } + } + if (!all_children_completed) { + continue; + } + + pruned_replica_count += vine_prune_file(vg->manager, parent_node->outfile); + /* this parent is pruned because a successor that produces a temp file is completed, it is unsafe because the + * manager may submit a recovery task to bring it back in case of worker failures. */ + parent_node->prune_status = PRUNE_STATUS_UNSAFE; + } + + list_delete(parents); + + node->time_spent_on_prune_ancestors_of_temp_node += timestamp_get() - start_time; + + return pruned_replica_count; +} + +/** + * Prune the ancestors of a node when it is completed. + * @param node Reference to the node object. + */ +static void prune_ancestors_of_node(struct vine_graph *vg, struct vine_node *node) +{ + if (!vg || !node) { + return; + } + + /* do not prune if the node has not completed */ + if (!node->completed) { + return; + } + + timestamp_t start_time = timestamp_get(); + + int pruned_replica_count = 0; + + switch (node->outfile_type) { + case NODE_OUTFILE_TYPE_LOCAL: + case NODE_OUTFILE_TYPE_SHARED_FILE_SYSTEM: + /* If the outfile was declared as a VINE_FILE or was written to the shared fs, then it is guaranteed to be persisted + * and there is no chance that it will be lost unexpectedly. So we can safely prune all ancestors of this node. */ + pruned_replica_count = prune_ancestors_of_persisted_node(vg, node); + break; + case NODE_OUTFILE_TYPE_TEMP: + /* Otherwise, if the node outfile is a temp file, we need to be careful about pruning, because temp files are prone + * to failures, while means they can be lost due to node evictions or failures. */ + pruned_replica_count = prune_ancestors_of_temp_node(vg, node); + break; + } + + timestamp_t elapsed_time = timestamp_get() - start_time; + + debug(D_VINE, "pruned %d ancestors of node %" PRIu64 " in %.6f seconds", pruned_replica_count, node->node_id, elapsed_time / 1000000.0); + + return; +} + +/** + * Print the time metrics of the vine graph to a csv file. + * @param vg Reference to the vine graph. + * @param filename Reference to the filename of the csv file. 
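+ * All durations are written in microseconds; any existing file at the given path is overwritten.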
+ */ +static void print_time_metrics(struct vine_graph *vg, const char *filename) +{ + if (!vg) { + return; + } + + /* first delete the file if it exists */ + if (access(filename, F_OK) != -1) { + unlink(filename); + } + + /* print the header as a csv file */ + FILE *fp = fopen(filename, "w"); + if (!fp) { + debug(D_ERROR, "failed to open file %s", filename); + return; + } + fprintf(fp, "node_id,submission_time_us,scheduling_time_us,commit_time_us,execution_time_us,retrieval_time_us,postprocessing_time_us\n"); + + uint64_t nid; + struct vine_node *node; + ITABLE_ITERATE(vg->nodes, nid, node) + { + fprintf(fp, "%" PRIu64 ",%lu,%lu,%lu,%lu,%lu,%lu\n", node->node_id, node->submission_time, node->scheduling_time, node->commit_time, node->execution_time, node->retrieval_time, node->postprocessing_time); + } + fclose(fp); + + return; +} + +/*************************************************************/ +/* Public APIs */ +/*************************************************************/ + +/** Tune the vine graph. + *@param vg Reference to the vine graph. + *@param name Reference to the name of the parameter to tune. + *@param value Reference to the value of the parameter to tune. + *@return 0 on success, -1 on failure. + */ +int vine_graph_tune(struct vine_graph *vg, const char *name, const char *value) +{ + if (!vg || !name || !value) { + return -1; + } + + if (strcmp(name, "failure-injection-step-percent") == 0) { + vg->failure_injection_step_percent = atof(value); + + } else if (strcmp(name, "task-priority-mode") == 0) { + if (strcmp(value, "random") == 0) { + vg->task_priority_mode = TASK_PRIORITY_MODE_RANDOM; + } else if (strcmp(value, "depth-first") == 0) { + vg->task_priority_mode = TASK_PRIORITY_MODE_DEPTH_FIRST; + } else if (strcmp(value, "breadth-first") == 0) { + vg->task_priority_mode = TASK_PRIORITY_MODE_BREADTH_FIRST; + } else if (strcmp(value, "fifo") == 0) { + vg->task_priority_mode = TASK_PRIORITY_MODE_FIFO; + } else if (strcmp(value, "lifo") == 0) { + vg->task_priority_mode = TASK_PRIORITY_MODE_LIFO; + } else if (strcmp(value, "largest-input-first") == 0) { + vg->task_priority_mode = TASK_PRIORITY_MODE_LARGEST_INPUT_FIRST; + } else if (strcmp(value, "largest-storage-footprint-first") == 0) { + vg->task_priority_mode = TASK_PRIORITY_MODE_LARGEST_STORAGE_FOOTPRINT_FIRST; + } else { + debug(D_ERROR, "invalid priority mode: %s", value); + return -1; + } + + } else if (strcmp(name, "output-dir") == 0) { + if (vg->output_dir) { + free(vg->output_dir); + } + if (mkdir(value, 0777) != 0 && errno != EEXIST) { + debug(D_ERROR, "failed to mkdir %s (errno=%d)", value, errno); + return -1; + } + vg->output_dir = xxstrdup(value); + + } else if (strcmp(name, "prune-depth") == 0) { + vg->prune_depth = atoi(value); + + } else if (strcmp(name, "checkpoint-fraction") == 0) { + double fraction = atof(value); + if (fraction < 0.0 || fraction > 1.0) { + debug(D_ERROR, "invalid checkpoint fraction: %s (must be between 0.0 and 1.0)", value); + return -1; + } + vg->checkpoint_fraction = fraction; + + } else if (strcmp(name, "checkpoint-dir") == 0) { + if (vg->checkpoint_dir) { + free(vg->checkpoint_dir); + } + if (mkdir(value, 0777) != 0 && errno != EEXIST) { + debug(D_ERROR, "failed to mkdir %s (errno=%d)", value, errno); + return -1; + } + vg->checkpoint_dir = xxstrdup(value); + + } else if (strcmp(name, "progress-bar-update-interval-sec") == 0) { + double val = atof(value); + vg->progress_bar_update_interval_sec = (val > 0.0) ? 
val : 0.1; + + } else if (strcmp(name, "time-metrics-filename") == 0) { + if (strcmp(value, "0") == 0) { + return 0; + } + + if (vg->time_metrics_filename) { + free(vg->time_metrics_filename); + } + + vg->time_metrics_filename = xxstrdup(value); + + /** Extract parent directory inline **/ + const char *slash = strrchr(vg->time_metrics_filename, '/'); + if (slash) { + size_t len = slash - vg->time_metrics_filename; + char *parent = malloc(len + 1); + memcpy(parent, vg->time_metrics_filename, len); + parent[len] = '\0'; + + /** Ensure the parent directory exists **/ + if (mkdir(parent, 0777) != 0 && errno != EEXIST) { + debug(D_ERROR, "failed to mkdir %s (errno=%d)", parent, errno); + free(parent); + return -1; + } + free(parent); + } + + /** Truncate or create the file **/ + FILE *fp = fopen(vg->time_metrics_filename, "w"); + if (!fp) { + debug(D_ERROR, "failed to create file %s (errno=%d)", vg->time_metrics_filename, errno); + return -1; + } + fclose(fp); + + } else if (strcmp(name, "enable-debug-log") == 0) { + if (vg->enable_debug_log == 0) { + return -1; + } + vg->enable_debug_log = (atoi(value) == 1) ? 1 : 0; + if (vg->enable_debug_log == 0) { + debug_flags_clear(); + debug_close(); + } + + } else { + debug(D_ERROR, "invalid parameter name: %s", name); + return -1; + } + + return 0; +} + +/** + * Get the outfile remote name of a node in the vine graph. + * @param vg Reference to the vine graph. + * @param node_id Reference to the node id. + * @return The outfile remote name. + */ +const char *vine_graph_get_node_outfile_remote_name(const struct vine_graph *vg, uint64_t node_id) +{ + if (!vg) { + return NULL; + } + + struct vine_node *node = itable_lookup(vg->nodes, node_id); + if (!node) { + return NULL; + } + + return node->outfile_remote_name; +} + +/** + * Get the proxy library name of the vine graph. + * @param vg Reference to the vine graph. + * @return The proxy library name. + */ +const char *vine_graph_get_proxy_library_name(const struct vine_graph *vg) +{ + if (!vg) { + return NULL; + } + + return vg->proxy_library_name; +} + +/** + * Set the proxy function name of the vine graph. + * @param vg Reference to the vine graph. + * @param proxy_function_name Reference to the proxy function name. + */ +void vine_graph_set_proxy_function_name(struct vine_graph *vg, const char *proxy_function_name) +{ + if (!vg || !proxy_function_name) { + return; + } + + if (vg->proxy_function_name) { + free(vg->proxy_function_name); + } + + vg->proxy_function_name = xxstrdup(proxy_function_name); +} + +/** + * Get the heavy score of a node in the vine graph. + * @param vg Reference to the vine graph. + * @param node_id Reference to the node id. + * @return The heavy score. + */ +double vine_graph_get_node_heavy_score(const struct vine_graph *vg, uint64_t node_id) +{ + if (!vg) { + return -1; + } + + struct vine_node *node = itable_lookup(vg->nodes, node_id); + if (!node) { + return -1; + } + + return node->heavy_score; +} + +/** + * Get the local outfile source of a node in the vine graph, only valid for local output files. + * The source of a local output file is the path on the local filesystem. + * @param vg Reference to the vine graph. + * @param node_id Reference to the node id. + * @return The local outfile source. 
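+ * Exits with an error if the node does not exist or does not produce a local output file.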
+ */ +const char *vine_graph_get_node_local_outfile_source(const struct vine_graph *vg, uint64_t node_id) +{ + if (!vg) { + return NULL; + } + + struct vine_node *node = itable_lookup(vg->nodes, node_id); + if (!node) { + debug(D_ERROR, "node %" PRIu64 " not found", node_id); + exit(1); + } + + if (node->outfile_type != NODE_OUTFILE_TYPE_LOCAL) { + debug(D_ERROR, "node %" PRIu64 " is not a local output file", node_id); + exit(1); + } + + return node->outfile->source; +} + +/** + * Compute the topology metrics of the vine graph, including depth, height, upstream and downstream counts, + * heavy scores, and weakly connected components. Must be called after all nodes and dependencies are added. + * @param vg Reference to the vine graph. + */ +void vine_graph_compute_topology_metrics(struct vine_graph *vg) +{ + if (!vg) { + return; + } + + /* get nodes in topological order */ + struct list *topo_order = get_topological_order(vg); + if (!topo_order) { + return; + } + + struct vine_node *node; + struct vine_node *parent_node; + struct vine_node *child_node; + + /* compute the depth of the node */ + LIST_ITERATE(topo_order, node) + { + node->depth = 0; + LIST_ITERATE(node->parents, parent_node) + { + if (node->depth < parent_node->depth + 1) { + node->depth = parent_node->depth + 1; + } + } + } + + /* compute the height of the node */ + LIST_ITERATE_REVERSE(topo_order, node) + { + node->height = 0; + LIST_ITERATE(node->children, child_node) + { + if (node->height < child_node->height + 1) { + node->height = child_node->height + 1; + } + } + } + + /* compute the upstream and downstream counts for each node */ + struct itable *upstream_map = itable_create(0); + struct itable *downstream_map = itable_create(0); + uint64_t nid_tmp; + ITABLE_ITERATE(vg->nodes, nid_tmp, node) + { + struct set *upstream = set_create(0); + struct set *downstream = set_create(0); + itable_insert(upstream_map, node->node_id, upstream); + itable_insert(downstream_map, node->node_id, downstream); + } + LIST_ITERATE(topo_order, node) + { + struct set *upstream = itable_lookup(upstream_map, node->node_id); + LIST_ITERATE(node->parents, parent_node) + { + struct set *parent_upstream = itable_lookup(upstream_map, parent_node->node_id); + set_union(upstream, parent_upstream); + set_insert(upstream, parent_node); + } + } + LIST_ITERATE_REVERSE(topo_order, node) + { + struct set *downstream = itable_lookup(downstream_map, node->node_id); + LIST_ITERATE(node->children, child_node) + { + struct set *child_downstream = itable_lookup(downstream_map, child_node->node_id); + set_union(downstream, child_downstream); + set_insert(downstream, child_node); + } + } + LIST_ITERATE(topo_order, node) + { + node->upstream_subgraph_size = set_size(itable_lookup(upstream_map, node->node_id)); + node->downstream_subgraph_size = set_size(itable_lookup(downstream_map, node->node_id)); + node->fan_in = list_size(node->parents); + node->fan_out = list_size(node->children); + set_delete(itable_lookup(upstream_map, node->node_id)); + set_delete(itable_lookup(downstream_map, node->node_id)); + } + itable_delete(upstream_map); + itable_delete(downstream_map); + + /* compute the heavy score for each node */ + LIST_ITERATE(topo_order, node) + { + node->heavy_score = compute_node_heavy_score(node); + } + + /* sort nodes using priority queue */ + int total_nodes = list_size(topo_order); + int total_target_nodes = 0; + struct priority_queue *sorted_nodes = priority_queue_create(total_nodes); + LIST_ITERATE(topo_order, node) + { + if (node->is_target) { + 
total_target_nodes++; + } + priority_queue_push(sorted_nodes, node, node->heavy_score); + } + /* calculate the number of nodes to be checkpointed */ + int checkpoint_count = (int)((total_nodes - total_target_nodes) * vg->checkpoint_fraction); + if (checkpoint_count < 0) { + checkpoint_count = 0; + } + + /* assign outfile types to each node */ + int assigned_checkpoint_count = 0; + while ((node = priority_queue_pop(sorted_nodes))) { + if (node->is_target) { + /* declare the output file as a vine_file so that it can be retrieved by the manager as usual */ + node->outfile_type = NODE_OUTFILE_TYPE_LOCAL; + char *local_outfile_path = string_format("%s/%s", vg->output_dir, node->outfile_remote_name); + node->outfile = vine_declare_file(vg->manager, local_outfile_path, VINE_CACHE_LEVEL_WORKFLOW, 0); + free(local_outfile_path); + continue; + } + if (assigned_checkpoint_count < checkpoint_count) { + /* checkpointed files will be written directly to the shared file system, no need to manage them in the manager */ + node->outfile_type = NODE_OUTFILE_TYPE_SHARED_FILE_SYSTEM; + char *shared_file_system_outfile_path = string_format("%s/%s", vg->checkpoint_dir, node->outfile_remote_name); + free(node->outfile_remote_name); + node->outfile_remote_name = shared_file_system_outfile_path; + node->outfile = NULL; + assigned_checkpoint_count++; + } else { + /* other nodes will be declared as temp files to leverage node-local storage */ + node->outfile_type = NODE_OUTFILE_TYPE_TEMP; + node->outfile = vine_declare_temp(vg->manager); + } + } + /* track the output dependencies of regular and vine_temp nodes */ + LIST_ITERATE(topo_order, node) + { + if (node->outfile) { + vine_task_add_output(node->task, node->outfile, node->outfile_remote_name, VINE_TRANSFER_ALWAYS); + } + } + priority_queue_delete(sorted_nodes); + + /* extract weakly connected components */ + struct list *weakly_connected_components = extract_weakly_connected_components(vg); + struct list *component; + int component_index = 0; + debug(D_VINE, "graph has %d weakly connected components\n", list_size(weakly_connected_components)); + LIST_ITERATE(weakly_connected_components, component) + { + debug(D_VINE, "component %d size: %d\n", component_index, list_size(component)); + list_delete(component); + component_index++; + } + list_delete(weakly_connected_components); + + list_delete(topo_order); + + return; +} + +/** + * Create a new node and track it in the vine graph. + * @param vg Reference to the vine graph. + * @return The auto-assigned node id. 
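+ * Ids start at the current node count plus one and are probed upward until an unused id is found.
+ * The proxy function and library names must be set before any node is added.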
+ */ +uint64_t vine_graph_add_node(struct vine_graph *vg) +{ + if (!vg) { + return 0; + } + + /* assign a new id based on current node count, ensure uniqueness */ + uint64_t candidate_id = itable_size(vg->nodes); + candidate_id += 1; + while (itable_lookup(vg->nodes, candidate_id)) { + candidate_id++; + } + uint64_t node_id = candidate_id; + + /* create the backing node (defaults to non-target) */ + struct vine_node *node = vine_node_create(node_id); + + if (!node) { + debug(D_ERROR, "failed to create node %" PRIu64, node_id); + vine_graph_delete(vg); + exit(1); + } + + if (!vg->proxy_function_name) { + debug(D_ERROR, "proxy function name is not set"); + vine_graph_delete(vg); + exit(1); + } + + if (!vg->proxy_library_name) { + debug(D_ERROR, "proxy library name is not set"); + vine_graph_delete(vg); + exit(1); + } + + /* create node task */ + node->task = vine_task_create(vg->proxy_function_name); + vine_task_set_library_required(node->task, vg->proxy_library_name); + vine_task_addref(node->task); + + /* construct the task arguments and declare the infile */ + char *task_arguments = vine_node_construct_task_arguments(node); + node->infile = vine_declare_buffer(vg->manager, task_arguments, strlen(task_arguments), VINE_CACHE_LEVEL_TASK, VINE_UNLINK_WHEN_DONE); + free(task_arguments); + vine_task_add_input(node->task, node->infile, "infile", VINE_TRANSFER_ALWAYS); + + /* initialize the pruning depth of each node, currently statically set to the global prune depth */ + node->prune_depth = vg->prune_depth; + + itable_insert(vg->nodes, node_id, node); + + return node_id; +} + +/** + * Mark a node as a retrieval target. + */ +void vine_graph_set_target(struct vine_graph *vg, uint64_t node_id) +{ + if (!vg) { + return; + } + struct vine_node *node = itable_lookup(vg->nodes, node_id); + if (!node) { + debug(D_ERROR, "node %" PRIu64 " not found", node_id); + exit(1); + } + node->is_target = 1; +} + +/** + * Create a new vine graph and bind a manager to it. + * @param q Reference to the manager object. + * @return A new vine graph instance. + */ +struct vine_graph *vine_graph_create(struct vine_manager *q) +{ + if (!q) { + return NULL; + } + + struct vine_graph *vg = xxmalloc(sizeof(struct vine_graph)); + + vg->manager = q; + + vg->checkpoint_dir = xxstrdup(vg->manager->runtime_directory); // default to current working directory + vg->output_dir = xxstrdup(vg->manager->runtime_directory); // default to current working directory + + vg->nodes = itable_create(0); + vg->task_id_to_node = itable_create(0); + vg->outfile_cachename_to_node = hash_table_create(0, 0); + + cctools_uuid_t proxy_library_name_id; + cctools_uuid_create(&proxy_library_name_id); + vg->proxy_library_name = xxstrdup(proxy_library_name_id.str); + + vg->proxy_function_name = NULL; + + vg->prune_depth = 1; + + vg->task_priority_mode = TASK_PRIORITY_MODE_LARGEST_INPUT_FIRST; + vg->failure_injection_step_percent = -1.0; + + vg->progress_bar_update_interval_sec = 0.1; + + /* enable debug system for C code since it uses a separate debug system instance + * from the Python bindings. Use the same function that the manager uses. */ + char *debug_tmp = string_format("%s/vine-logs/debug", vg->manager->runtime_directory); + vine_enable_debug_log(debug_tmp); + free(debug_tmp); + + vg->time_metrics_filename = NULL; + + vg->enable_debug_log = 1; + + return vg; +} + +/** + * Add a dependency between two nodes in the vine graph. 
Note that the input-output file relationship + * is not handled here, because their file names might not have been determined yet. + * @param vg Reference to the vine graph. + * @param parent_id Reference to the parent node id. + * @param child_id Reference to the child node id. + */ +void vine_graph_add_dependency(struct vine_graph *vg, uint64_t parent_id, uint64_t child_id) +{ + if (!vg) { + return; + } + + struct vine_node *parent_node = itable_lookup(vg->nodes, parent_id); + struct vine_node *child_node = itable_lookup(vg->nodes, child_id); + if (!parent_node) { + debug(D_ERROR, "parent node %" PRIu64 " not found", parent_id); + uint64_t nid; + struct vine_node *node; + printf("parent_ids:\n"); + ITABLE_ITERATE(vg->nodes, nid, node) + { + printf(" %" PRIu64 "\n", node->node_id); + } + exit(1); + } + if (!child_node) { + debug(D_ERROR, "child node %" PRIu64 " not found", child_id); + exit(1); + } + + list_push_tail(child_node->parents, parent_node); + list_push_tail(parent_node->children, child_node); + + return; +} + +/** + * Execute the vine graph. This must be called after all nodes and dependencies are added and the topology metrics are computed. + * @param vg Reference to the vine graph. + */ +void vine_graph_execute(struct vine_graph *vg) +{ + if (!vg) { + return; + } + + signal(SIGINT, handle_sigint); + + debug(D_VINE, "start executing vine graph"); + + /* print the info of all nodes */ + uint64_t nid_iter; + struct vine_node *node; + ITABLE_ITERATE(vg->nodes, nid_iter, node) + { + vine_node_debug_print(node); + } + + /* enable return recovery tasks */ + vine_enable_return_recovery_tasks(vg->manager); + + /* create mappings from task IDs and outfile cache names to nodes */ + ITABLE_ITERATE(vg->nodes, nid_iter, node) + { + if (node->outfile) { + hash_table_insert(vg->outfile_cachename_to_node, node->outfile->cached_name, node); + } + } + + /* add the parents' outfiles as inputs to the task */ + struct list *topo_order = get_topological_order(vg); + LIST_ITERATE(topo_order, node) + { + struct vine_node *parent_node; + LIST_ITERATE(node->parents, parent_node) + { + if (parent_node->outfile) { + vine_task_add_input(node->task, parent_node->outfile, parent_node->outfile_remote_name, VINE_TRANSFER_ALWAYS); + } + } + } + + /* initialize pending_parents for all nodes */ + ITABLE_ITERATE(vg->nodes, nid_iter, node) + { + struct vine_node *parent_node; + LIST_ITERATE(node->parents, parent_node) + { + if (node->pending_parents) { + /* Use parent pointer to ensure pointer consistency */ + set_insert(node->pending_parents, parent_node); + } + } + } + + /* enqueue those without dependencies */ + ITABLE_ITERATE(vg->nodes, nid_iter, node) + { + if (!node->pending_parents || set_size(node->pending_parents) == 0) { + submit_node_task(vg, node); + } + } + + /* calculate steps to inject failure */ + double next_failure_threshold = -1.0; + if (vg->failure_injection_step_percent > 0) { + next_failure_threshold = vg->failure_injection_step_percent / 100.0; + } + + struct ProgressBar *pbar = progress_bar_init("Executing Tasks"); + progress_bar_set_update_interval(pbar, vg->progress_bar_update_interval_sec); + + struct ProgressBarPart *regular_tasks_part = progress_bar_create_part("Regular", itable_size(vg->nodes)); + struct ProgressBarPart *recovery_tasks_part = progress_bar_create_part("Recovery", 0); + progress_bar_bind_part(pbar, regular_tasks_part); + progress_bar_bind_part(pbar, recovery_tasks_part); + + int wait_timeout = 1; + + while (regular_tasks_part->current < regular_tasks_part->total) { + if 
(interrupted) { + break; + } + + struct vine_task *task = vine_wait(vg->manager, wait_timeout); + progress_bar_set_part_total(pbar, recovery_tasks_part, vg->manager->num_submitted_recovery_tasks); + if (task) { + /* retrieve all possible tasks */ + wait_timeout = 0; + + timestamp_t time_when_postprocessing_start = timestamp_get(); + + /* get the original node by task id */ + struct vine_node *node = get_node_by_task(vg, task); + if (!node) { + debug(D_ERROR, "fatal: task %d could not be mapped to a task node, this indicates a serious bug.", task->task_id); + exit(1); + } + + /* in case of failure, resubmit this task */ + if (node->task->result != VINE_RESULT_SUCCESS || node->task->exit_code != 0) { + if (node->retry_attempts_left <= 0) { + debug(D_ERROR, "Task %d failed (result=%d, exit=%d). Node %" PRIu64 " has no retries left. Aborting.", task->task_id, node->task->result, node->task->exit_code, node->node_id); + vine_graph_delete(vg); + exit(1); + } + node->retry_attempts_left--; + debug(D_VINE | D_NOTICE, "Task %d failed (result=%d, exit=%d). Retrying node %" PRIu64 " (remaining=%d)...", task->task_id, node->task->result, node->task->exit_code, node->node_id, node->retry_attempts_left); + vine_task_reset(node->task); + submit_node_task(vg, node); + continue; + } + + /* if the outfile is set to save on the sharedfs, stat to get the size of the file */ + switch (node->outfile_type) { + case NODE_OUTFILE_TYPE_SHARED_FILE_SYSTEM: { + struct stat info; + int result = stat(node->outfile_remote_name, &info); + if (result < 0) { + if (node->retry_attempts_left <= 0) { + debug(D_ERROR, "Task %d succeeded but missing sharedfs output %s; no retries left for node %" PRIu64 ". Aborting.", task->task_id, node->outfile_remote_name, node->node_id); + vine_graph_delete(vg); + exit(1); + } + node->retry_attempts_left--; + debug(D_VINE | D_NOTICE, "Task %d succeeded but missing sharedfs output %s; retrying node %" PRIu64 " (remaining=%d)...", task->task_id, node->outfile_remote_name, node->node_id, node->retry_attempts_left); + vine_task_reset(node->task); + submit_node_task(vg, node); + continue; + } + node->outfile_size_bytes = info.st_size; + break; + } + case NODE_OUTFILE_TYPE_LOCAL: + case NODE_OUTFILE_TYPE_TEMP: + node->outfile_size_bytes = node->outfile->size; + break; + } + debug(D_VINE, "Node %" PRIu64 " completed with outfile %s size: %zu bytes", node->node_id, node->outfile_remote_name, node->outfile_size_bytes); + + /* mark the node as completed */ + node->completed = 1; + node->scheduling_time = task->time_when_scheduling_end - task->time_when_scheduling_start; + node->commit_time = task->time_when_commit_end - task->time_when_commit_start; + node->execution_time = task->time_workers_execute_last; + node->retrieval_time = task->time_when_get_result_end - task->time_when_get_result_start; + + /* prune nodes on task completion */ + prune_ancestors_of_node(vg, node); + + /* skip recovery tasks */ + if (task->type == VINE_TASK_TYPE_RECOVERY) { + progress_bar_update_part(pbar, recovery_tasks_part, 1); + continue; + } + + /* set the start time to the submit time of the first regular task */ + if (regular_tasks_part->current == 0) { + progress_bar_set_start_time(pbar, task->time_when_commit_start); + } + + /* update critical time */ + vine_node_update_critical_path_time(node, node->execution_time); + + /* mark this regular task as completed */ + progress_bar_update_part(pbar, regular_tasks_part, 1); + + /* inject failure */ + if (vg->failure_injection_step_percent > 0) { + double progress = 
(double)regular_tasks_part->current / (double)regular_tasks_part->total; + if (progress >= next_failure_threshold && evict_random_worker(vg->manager)) { + debug(D_VINE, "evicted a worker at %.2f%% (threshold %.2f%%)", progress * 100, next_failure_threshold * 100); + next_failure_threshold += vg->failure_injection_step_percent / 100.0; + } + } + + /* enqueue the output file for replication */ + switch (node->outfile_type) { + case NODE_OUTFILE_TYPE_TEMP: + /* replicate the outfile of the temp node */ + vine_temp_replicate_file_later(vg->manager, node->outfile); + break; + case NODE_OUTFILE_TYPE_LOCAL: + case NODE_OUTFILE_TYPE_SHARED_FILE_SYSTEM: + break; + } + + /* submit children nodes with dependencies all resolved */ + submit_unblocked_children(vg, node); + + timestamp_t time_when_postprocessing_end = timestamp_get(); + node->postprocessing_time = time_when_postprocessing_end - time_when_postprocessing_start; + } else { + wait_timeout = 1; + progress_bar_update_part(pbar, recovery_tasks_part, 0); // refresh the time and total for recovery tasks + } + } + + progress_bar_finish(pbar); + progress_bar_delete(pbar); + + double total_time_spent_on_unlink_local_files = 0; + double total_time_spent_on_prune_ancestors_of_temp_node = 0; + double total_time_spent_on_prune_ancestors_of_persisted_node = 0; + ITABLE_ITERATE(vg->nodes, nid_iter, node) + { + total_time_spent_on_unlink_local_files += node->time_spent_on_unlink_local_files; + total_time_spent_on_prune_ancestors_of_temp_node += node->time_spent_on_prune_ancestors_of_temp_node; + total_time_spent_on_prune_ancestors_of_persisted_node += node->time_spent_on_prune_ancestors_of_persisted_node; + } + total_time_spent_on_unlink_local_files /= 1e6; + total_time_spent_on_prune_ancestors_of_temp_node /= 1e6; + total_time_spent_on_prune_ancestors_of_persisted_node /= 1e6; + + debug(D_VINE, "total time spent on prune ancestors of temp node: %.6f seconds\n", total_time_spent_on_prune_ancestors_of_temp_node); + debug(D_VINE, "total time spent on prune ancestors of persisted node: %.6f seconds\n", total_time_spent_on_prune_ancestors_of_persisted_node); + debug(D_VINE, "total time spent on unlink local files: %.6f seconds\n", total_time_spent_on_unlink_local_files); + + if (vg->time_metrics_filename) { + print_time_metrics(vg, vg->time_metrics_filename); + } + + return; +} + +/** + * Delete a vine graph instance. + * @param vg Reference to the vine graph. 
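+ * Prunes each node's declared files, removes checkpointed outputs from the shared file system, and frees all graph state.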
+ */ +void vine_graph_delete(struct vine_graph *vg) +{ + if (!vg) { + return; + } + + uint64_t nid; + struct vine_node *node; + ITABLE_ITERATE(vg->nodes, nid, node) + { + if (node->infile) { + vine_prune_file(vg->manager, node->infile); + hash_table_remove(vg->manager->file_table, node->infile->cached_name); + } + if (node->outfile) { + vine_prune_file(vg->manager, node->outfile); + hash_table_remove(vg->outfile_cachename_to_node, node->outfile->cached_name); + hash_table_remove(vg->manager->file_table, node->outfile->cached_name); + } + if (node->outfile_type == NODE_OUTFILE_TYPE_SHARED_FILE_SYSTEM) { + unlink(node->outfile_remote_name); + } + vine_node_delete(node); + } + + free(vg->proxy_library_name); + free(vg->proxy_function_name); + + itable_delete(vg->nodes); + itable_delete(vg->task_id_to_node); + hash_table_delete(vg->outfile_cachename_to_node); + free(vg); +} diff --git a/taskvine/src/graph/vinedag/vine_graph/vine_graph.h b/taskvine/src/graph/vinedag/vine_graph/vine_graph.h new file mode 100644 index 0000000000..e3f78f5c83 --- /dev/null +++ b/taskvine/src/graph/vinedag/vine_graph/vine_graph.h @@ -0,0 +1,150 @@ +#ifndef VINE_GRAPH_H +#define VINE_GRAPH_H + +#include + +#include "vine_task.h" +#include "hash_table.h" +#include "itable.h" +#include "list.h" +#include "vine_manager.h" +#include "set.h" +#include "vine_node.h" +#include "taskvine.h" + +/** The task priority algorithm used for vine graph scheduling. */ +typedef enum { + TASK_PRIORITY_MODE_RANDOM = 0, /**< Assign random priority to tasks */ + TASK_PRIORITY_MODE_DEPTH_FIRST, /**< Prioritize deeper tasks first */ + TASK_PRIORITY_MODE_BREADTH_FIRST, /**< Prioritize shallower tasks first */ + TASK_PRIORITY_MODE_FIFO, /**< First in, first out priority */ + TASK_PRIORITY_MODE_LIFO, /**< Last in, first out priority */ + TASK_PRIORITY_MODE_LARGEST_INPUT_FIRST, /**< Prioritize tasks with larger inputs first */ + TASK_PRIORITY_MODE_LARGEST_STORAGE_FOOTPRINT_FIRST /**< Prioritize tasks with larger storage footprint first */ +} task_priority_mode_t; + +/** The vine graph (logical scheduling layer). */ +struct vine_graph { + struct vine_manager *manager; + struct itable *nodes; + struct itable *task_id_to_node; + struct hash_table *outfile_cachename_to_node; + + /* The directory to store the checkpointed results. + * Only intermediate results can be checkpointed, the fraction of intermediate results to checkpoint is controlled by the checkpoint-fraction parameter. */ + char *checkpoint_dir; + + /* Results of target nodes will be stored in this directory. + * This dir path can not necessarily be a shared file system directory, + * output files will be retrieved through the network instead, + * as long as the manager can access it. */ + char *output_dir; + + /* Python-side proxy library name. The context_graph runtime owns this library and sends calls into the vine graph + * so the manager can execute them through the proxy function. */ + char *proxy_library_name; + + /* The proxy function lives inside that library. It receives vine node IDs, looks up the + * Python callable and arguments inside the context_graph runtime, and executes the work. The runtime generates the name + * and shares it with the vine graph. */ + char *proxy_function_name; + + /* The depth of the pruning strategy. 0 means no pruning, 1 means the most aggressive pruning. 
*/ + int prune_depth; + double checkpoint_fraction; /* 0 - 1, the fraction of intermediate results to checkpoint */ + + task_priority_mode_t task_priority_mode; /* priority mode for task graph task scheduling */ + double failure_injection_step_percent; /* 0 - 100, the percentage of steps to inject failure */ + + double progress_bar_update_interval_sec; /* update interval for the progress bar in seconds */ + + /* The filename of the csv file to store the time metrics of the vine graph. */ + char *time_metrics_filename; + + int enable_debug_log; /* whether to enable debug log */ +}; + +/* Public APIs for operating the vine graph */ + +/** Create a vine graph and return it. +@param q Reference to the current manager object. +@return A new vine graph. +*/ +struct vine_graph *vine_graph_create(struct vine_manager *q); + +/** Create a new node in the vine graph. +@param vg Reference to the vine graph. +@return The auto-assigned node id. +*/ +uint64_t vine_graph_add_node(struct vine_graph *vg); + +/** Mark a node as a retrieval target. +@param vg Reference to the vine graph. +@param node_id Identifier of the node to mark as target. +*/ +void vine_graph_set_target(struct vine_graph *vg, uint64_t node_id); + +/** Add a dependency between two nodes in the vine graph. +@param vg Reference to the vine graph. +@param parent_id Identifier of the parent node. +@param child_id Identifier of the child node. +*/ +void vine_graph_add_dependency(struct vine_graph *vg, uint64_t parent_id, uint64_t child_id); + +/** Finalize the metrics of the vine graph. +@param vg Reference to the vine graph. +*/ +void vine_graph_compute_topology_metrics(struct vine_graph *vg); + +/** Get the heavy score of a node in the vine graph. +@param vg Reference to the vine graph. +@param node_id Identifier of the node. +@return The heavy score. +*/ +double vine_graph_get_node_heavy_score(const struct vine_graph *vg, uint64_t node_id); + +/** Execute the task graph. +@param vg Reference to the vine graph. +*/ +void vine_graph_execute(struct vine_graph *vg); + +/** Get the outfile remote name of a node in the vine graph. +@param vg Reference to the vine graph. +@param node_id Identifier of the node. +@return The outfile remote name. +*/ +const char *vine_graph_get_node_outfile_remote_name(const struct vine_graph *vg, uint64_t node_id); + +/** Get the local outfile source of a node in the vine graph. +@param vg Reference to the vine graph. +@param node_id Identifier of the node. +@return The local outfile source, or NULL if the node does not produce a local file. +*/ +const char *vine_graph_get_node_local_outfile_source(const struct vine_graph *vg, uint64_t node_id); + +/** Delete a vine graph. +@param vg Reference to the vine graph. +*/ +void vine_graph_delete(struct vine_graph *vg); + +/** Get the proxy library name of the vine graph. +@param vg Reference to the vine graph. +@return The proxy library name. +*/ +const char *vine_graph_get_proxy_library_name(const struct vine_graph *vg); + +/** Set the proxy function name of the vine graph. +@param vg Reference to the vine graph. +@param proxy_function_name Reference to the proxy function name. +*/ +void vine_graph_set_proxy_function_name(struct vine_graph *vg, const char *proxy_function_name); + +/** Tune the vine graph. +@param vg Reference to the vine graph. +@param name Reference to the name of the parameter to tune. +@param value Reference to the value of the parameter to tune. +@return 0 on success, -1 on failure. 
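+Recognized names include "task-priority-mode", "prune-depth", "checkpoint-fraction", "checkpoint-dir", "output-dir", "failure-injection-step-percent", "progress-bar-update-interval-sec", "time-metrics-filename", and "enable-debug-log".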
+*/ +int vine_graph_tune(struct vine_graph *vg, const char *name, const char *value); + +#endif // VINE_GRAPH_H diff --git a/taskvine/src/graph/vinedag/vine_graph/vine_graph.i b/taskvine/src/graph/vinedag/vine_graph/vine_graph.i new file mode 100644 index 0000000000..b87d428c01 --- /dev/null +++ b/taskvine/src/graph/vinedag/vine_graph/vine_graph.i @@ -0,0 +1,15 @@ +/* SWIG interface for local vinedag graph API bindings */ +%module vine_graph_capi + +%{ +#include "int_sizes.h" +#include "vine_graph.h" +%} + +%include "stdint.i" +%include "int_sizes.h" + +/* Import existing SWIG interface for type information (do not wrap again) */ +%import "../../bindings/python3/taskvine.i" + +%include "vine_graph.h" diff --git a/taskvine/src/graph/vinedag/vine_graph/vine_graph_client.py b/taskvine/src/graph/vinedag/vine_graph/vine_graph_client.py new file mode 100644 index 0000000000..6019e6e74c --- /dev/null +++ b/taskvine/src/graph/vinedag/vine_graph/vine_graph_client.py @@ -0,0 +1,75 @@ +# Copyright (C) 2025 The University of Notre Dame +# This software is distributed under the GNU General Public License. +# See the file COPYING for details. + +"""High-level client that exposes the C vine graph to Python callers.""" + +from . import vine_graph_capi + + +class VineGraphClient: + """Python-friendly wrapper that hides the raw SWIG API surface.""" + + def __init__(self, c_taskvine): + """Create and own the lifecycle of the backing C vine graph instance.""" + self._c_graph = vine_graph_capi.vine_graph_create(c_taskvine) + self._key_to_id = {} + self._id_to_key = {} + + def tune(self, name, value): + """Forward tuning parameters directly to the C vine graph.""" + vine_graph_capi.vine_graph_tune(self._c_graph, name, value) + + def add_node(self, key, is_target=None): + """Create a node in the C graph and remember the key↔id mapping.""" + node_id = vine_graph_capi.vine_graph_add_node(self._c_graph) + self._key_to_id[key] = node_id + self._id_to_key[node_id] = key + if is_target is not None and bool(is_target): + vine_graph_capi.vine_graph_set_target(self._c_graph, node_id) + return node_id + + def set_target(self, key): + """Mark an existing node as a target output.""" + node_id = self._key_to_id.get(key) + if node_id is None: + raise KeyError(f"Key not found: {key}") + vine_graph_capi.vine_graph_set_target(self._c_graph, node_id) + + def add_dependency(self, parent_key, child_key): + """Add an edge in the C graph using the remembered id mapping.""" + if parent_key not in self._key_to_id or child_key not in self._key_to_id: + raise KeyError("parent_key or child_key missing in mapping; call add_node() first") + vine_graph_capi.vine_graph_add_dependency( + self._c_graph, self._key_to_id[parent_key], self._key_to_id[child_key] + ) + + def compute_topology_metrics(self): + """Trigger the C graph to compute depth/height, heavy-score, etc.""" + vine_graph_capi.vine_graph_compute_topology_metrics(self._c_graph) + + def get_node_outfile_remote_name(self, key): + """Ask the C layer where a node's output will be stored.""" + if key not in self._key_to_id: + raise KeyError(f"Key not found: {key}") + return vine_graph_capi.vine_graph_get_node_outfile_remote_name( + self._c_graph, self._key_to_id[key] + ) + + def get_proxy_library_name(self): + """Expose the randomly generated proxy library name from the C side.""" + return vine_graph_capi.vine_graph_get_proxy_library_name(self._c_graph) + + def set_proxy_function(self, proxy_function): + """Tell the C graph which Python function should run on the workers.""" + 
vine_graph_capi.vine_graph_set_proxy_function_name( + self._c_graph, proxy_function.__name__ + ) + + def execute(self): + """Kick off execution; runs through SWIG down into the C orchestration loop.""" + vine_graph_capi.vine_graph_execute(self._c_graph) + + def delete(self): + """Release the C resources and clear the client.""" + vine_graph_capi.vine_graph_delete(self._c_graph) diff --git a/taskvine/src/graph/vinedag/vine_graph/vine_node.c b/taskvine/src/graph/vinedag/vine_graph/vine_node.c new file mode 100644 index 0000000000..86e0dfffb4 --- /dev/null +++ b/taskvine/src/graph/vinedag/vine_graph/vine_node.c @@ -0,0 +1,407 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "jx.h" +#include "jx_print.h" +#include "xxmalloc.h" +#include "stringtools.h" +#include "timestamp.h" +#include "set.h" +#include "hash_table.h" +#include "debug.h" +#include "random.h" +#include "uuid.h" + +#include "vine_file.h" +#include "vine_task.h" +#include "vine_worker_info.h" +#include "vine_temp.h" +#include "vine_node.h" +#include "taskvine.h" + +/*************************************************************/ +/* Private Functions */ +/*************************************************************/ + +/** + * Check if the outfile of a node is persisted. + * A node is considered persisted if it has completed and 1) the outfile is written to the shared file system, + * 2) the outfile is written to the local staging directory. + * @param node Reference to the node object. + * @return 1 if the outfile is persisted, 0 otherwise. + */ +static int node_outfile_has_been_persisted(struct vine_node *node) +{ + if (!node) { + return 0; + } + + /* if the node is not completed then the outfile is definitely not persisted */ + if (!node->completed) { + return 0; + } + + switch (node->outfile_type) { + case NODE_OUTFILE_TYPE_LOCAL: + return 1; + case NODE_OUTFILE_TYPE_SHARED_FILE_SYSTEM: + return 1; + case NODE_OUTFILE_TYPE_TEMP: + return 0; + } + + return 0; +} + +/** + * Update the critical path time of a node. + * @param node Reference to the node object. + * @param execution_time Reference to the execution time of the node. + */ +void vine_node_update_critical_path_time(struct vine_node *node, timestamp_t execution_time) +{ + timestamp_t max_parent_critical_path_time = 0; + struct vine_node *parent_node; + LIST_ITERATE(node->parents, parent_node) + { + if (parent_node->critical_path_time > max_parent_critical_path_time) { + max_parent_critical_path_time = parent_node->critical_path_time; + } + } + node->critical_path_time = max_parent_critical_path_time + execution_time; +} + +/** + * The dfs helper function for finding parents in a specific depth. + * @param node Reference to the node object. + * @param remaining_depth Reference to the remaining depth. + * @param result Reference to the result list. + * @param visited Reference to the visited set. 
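+ * Ancestors are appended to the result only when the remaining depth reaches zero; the visited set prevents revisiting shared ancestors.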
+ */ +static void find_parents_dfs(struct vine_node *node, int remaining_depth, struct list *result, struct set *visited) +{ + if (!node || set_lookup(visited, node)) { + return; + } + + set_insert(visited, node); + if (remaining_depth == 0) { + list_push_tail(result, node); + return; + } + struct vine_node *parent_node; + LIST_ITERATE(node->parents, parent_node) + { + find_parents_dfs(parent_node, remaining_depth - 1, result, visited); + } +} + +/*************************************************************/ +/* Public APIs */ +/*************************************************************/ + +/** + * Create a new vine node owned by the C-side graph. + * @param node_id Graph-assigned identifier that keeps C and Python in sync. + * @return Newly allocated vine node. + */ +struct vine_node *vine_node_create(uint64_t node_id) +{ + struct vine_node *node = xxmalloc(sizeof(struct vine_node)); + + node->is_target = 0; + node->node_id = node_id; + + /* create a unique UUID-based remote outfile name for this node */ + cctools_uuid_t uuid; + cctools_uuid_create(&uuid); + node->outfile_remote_name = xxstrdup(uuid.str); + + node->prune_status = PRUNE_STATUS_NOT_PRUNED; + node->parents = list_create(); + node->children = list_create(); + node->pending_parents = set_create(0); + node->completed = 0; + node->prune_depth = 0; + node->retry_attempts_left = 1; + node->outfile_size_bytes = 0; + + node->depth = -1; + node->height = -1; + node->upstream_subgraph_size = -1; + node->downstream_subgraph_size = -1; + node->fan_in = -1; + node->fan_out = -1; + node->heavy_score = -1; + + node->time_spent_on_unlink_local_files = 0; + node->time_spent_on_prune_ancestors_of_temp_node = 0; + node->time_spent_on_prune_ancestors_of_persisted_node = 0; + + node->submission_time = 0; + node->scheduling_time = 0; + node->commit_time = 0; + node->execution_time = 0; + node->retrieval_time = 0; + node->postprocessing_time = 0; + + node->critical_path_time = -1; + + return node; +} + +/** + * Construct the task arguments for the node. + * @param node Reference to the node object. + * @return The task arguments in JSON format: {"fn_args": [node_id], "fn_kwargs": {}}. + */ +char *vine_node_construct_task_arguments(struct vine_node *node) +{ + if (!node) { + return NULL; + } + + struct jx *event = jx_object(NULL); + struct jx *args = jx_array(NULL); + jx_array_append(args, jx_integer(node->node_id)); + jx_insert(event, jx_string("fn_args"), args); + jx_insert(event, jx_string("fn_kwargs"), jx_object(NULL)); + + char *infile_content = jx_print_string(event); + jx_delete(event); + + return infile_content; +} + +/** + * Find all parents in a specific depth of the node. + * @param node Reference to the node object. + * @param depth Reference to the depth. + * @return The list of parents. + */ +struct list *vine_node_find_parents_by_depth(struct vine_node *node, int depth) +{ + if (!node || depth < 0) { + return NULL; + } + + struct list *result = list_create(); + + struct set *visited = set_create(0); + find_parents_dfs(node, depth, result, visited); + set_delete(visited); + + return result; +} + +/** + * Perform a reverse BFS traversal to identify all ancestors of a given node + * whose outputs can be safely pruned. + * + * A parent node is considered "safe" if: + * 1. All of its child nodes are either: + * - already persisted (their outputs are stored in a reliable location), or + * - already marked as safely pruned. + * 2. None of its child nodes remain in an unsafe or incomplete state. 
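+ * Children that have already been visited during this traversal are treated as safe.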
+ * + * This function starts from the given node and iteratively walks up the DAG, + * collecting all such "safe" ancestors into a set. Nodes that have already + * been marked as PRUNE_STATUS_SAFE are skipped early. + * + * The returned set contains all ancestors that can be safely pruned once the + * current node’s output has been persisted. + * + * @param start_node The node from which to begin the reverse search. + * @return A set of ancestor nodes that are safe to prune (excluding start_node). + */ +struct set *vine_node_find_safe_ancestors(struct vine_node *start_node) +{ + if (!start_node) { + return NULL; + } + + struct set *visited_nodes = set_create(0); + struct set *safe_ancestors = set_create(0); + + struct list *queue = list_create(); + + list_push_tail(queue, start_node); + set_insert(visited_nodes, start_node); + + while (list_size(queue) > 0) { + struct vine_node *current_node = list_pop_head(queue); + struct vine_node *parent_node; + + LIST_ITERATE(current_node->parents, parent_node) + { + if (set_lookup(visited_nodes, parent_node)) { + continue; + } + + set_insert(visited_nodes, parent_node); + + /* shortcut if this parent has already been marked as safely pruned */ + if (parent_node->prune_status == PRUNE_STATUS_SAFE) { + continue; + } + + /* check if all children of this parent are safe */ + int all_children_safe = 1; + struct vine_node *child_node; + LIST_ITERATE(parent_node->children, child_node) + { + /* shortcut if this child is part of the recovery subgraph */ + if (set_lookup(visited_nodes, child_node)) { + continue; + } + /* shortcut if this outside child is not persisted */ + if (!node_outfile_has_been_persisted(child_node)) { + all_children_safe = 0; + break; + } + /* shortcut if this outside child is unsafely pruned */ + if (child_node->prune_status == PRUNE_STATUS_UNSAFE) { + all_children_safe = 0; + break; + } + } + + if (all_children_safe) { + set_insert(safe_ancestors, parent_node); + list_push_tail(queue, parent_node); + } + } + } + + list_delete(queue); + set_delete(visited_nodes); + + return safe_ancestors; +} + +/** + * Print the info of the node. + * @param node Reference to the node object. + */ +void vine_node_debug_print(struct vine_node *node) +{ + if (!node) { + return; + } + + if (!node->task) { + debug(D_ERROR, "node %" PRIu64 " has no task", node->node_id); + return; + } + + debug(D_VINE, "---------------- Node Info ----------------"); + debug(D_VINE, "node_id: %" PRIu64, node->node_id); + debug(D_VINE, "task_id: %d", node->task->task_id); + debug(D_VINE, "depth: %d", node->depth); + debug(D_VINE, "height: %d", node->height); + debug(D_VINE, "prune_depth: %d", node->prune_depth); + + if (node->outfile_remote_name) { + debug(D_VINE, "outfile_remote_name: %s", node->outfile_remote_name); + } + + if (node->outfile) { + const char *type_str = "UNKNOWN"; + switch (node->outfile->type) { + case VINE_FILE: + type_str = "VINE_FILE"; + break; + case VINE_TEMP: + type_str = "VINE_TEMP"; + break; + case VINE_URL: + type_str = "VINE_URL"; + break; + case VINE_BUFFER: + type_str = "VINE_BUFFER"; + break; + case VINE_MINI_TASK: + type_str = "VINE_MINI_TASK"; + break; + } + debug(D_VINE, "outfile_type: %s", type_str); + debug(D_VINE, "outfile_cached_name: %s", node->outfile->cached_name ? 
node->outfile->cached_name : "(null)"); + } else { + debug(D_VINE, "outfile_type: SHARED_FILE_SYSTEM or none"); + } + + /* print parent and child node ids */ + char *parent_ids = NULL; + struct vine_node *p; + LIST_ITERATE(node->parents, p) + { + if (!parent_ids) { + parent_ids = string_format("%" PRIu64, p->node_id); + } else { + char *tmp = string_format("%s, %" PRIu64, parent_ids, p->node_id); + free(parent_ids); + parent_ids = tmp; + } + } + + char *child_ids = NULL; + struct vine_node *c; + LIST_ITERATE(node->children, c) + { + if (!child_ids) { + child_ids = string_format("%" PRIu64, c->node_id); + } else { + char *tmp = string_format("%s, %" PRIu64, child_ids, c->node_id); + free(child_ids); + child_ids = tmp; + } + } + + debug(D_VINE, "parents: %s", parent_ids ? parent_ids : "(none)"); + debug(D_VINE, "children: %s", child_ids ? child_ids : "(none)"); + + free(parent_ids); + free(child_ids); + + debug(D_VINE, "-------------------------------------------"); +} + +/** + * Delete the node and all of its associated resources. + * @param node Reference to the node object. + */ +void vine_node_delete(struct vine_node *node) +{ + if (!node) { + return; + } + + if (node->outfile_remote_name) { + free(node->outfile_remote_name); + } + + vine_task_delete(node->task); + node->task = NULL; + + if (node->infile) { + vine_file_delete(node->infile); + node->infile = NULL; + } + if (node->outfile) { + vine_file_delete(node->outfile); + node->outfile = NULL; + } + + list_delete(node->parents); + list_delete(node->children); + + if (node->pending_parents) { + set_delete(node->pending_parents); + } + free(node); +} \ No newline at end of file diff --git a/taskvine/src/graph/vinedag/vine_graph/vine_node.h b/taskvine/src/graph/vinedag/vine_graph/vine_node.h new file mode 100644 index 0000000000..9f01e959c0 --- /dev/null +++ b/taskvine/src/graph/vinedag/vine_graph/vine_node.h @@ -0,0 +1,115 @@ +#ifndef VINE_NODE_H +#define VINE_NODE_H + +#include + +#include "vine_task.h" +#include "hash_table.h" +#include "list.h" +#include "set.h" +#include "taskvine.h" + +/** The storage type of the node's output file. */ +typedef enum { + NODE_OUTFILE_TYPE_LOCAL = 0, /* Node-output file will be stored locally on the local staging directory */ + NODE_OUTFILE_TYPE_TEMP, /* Node-output file will be stored in the temporary node-local storage */ + NODE_OUTFILE_TYPE_SHARED_FILE_SYSTEM, /* Node-output file will be stored in the persistent shared file system */ +} node_outfile_type_t; + +/** The status of an output file of a node. */ +typedef enum { + PRUNE_STATUS_NOT_PRUNED = 0, + PRUNE_STATUS_SAFE, + PRUNE_STATUS_UNSAFE +} prune_status_t; + +/** The vine node object. */ +struct vine_node { + /* Identity */ + uint64_t node_id; /* Unique identifier assigned by the graph when the node is created. */ + int is_target; /* If true, the output of the node is retrieved when the task finishes. 
*/ + + /* Task and files */ + struct vine_task *task; + struct vine_file *infile; + struct vine_file *outfile; + char *outfile_remote_name; + size_t outfile_size_bytes; + node_outfile_type_t outfile_type; + + /* Graph relationships */ + struct list *parents; + struct list *children; + + /* Execution and scheduling state */ + struct set *pending_parents; + int retry_attempts_left; + int completed; + prune_status_t prune_status; + + /* Structural metrics */ + int prune_depth; + int depth; + int height; + int upstream_subgraph_size; + int downstream_subgraph_size; + int fan_in; + int fan_out; + double heavy_score; + + /* Time metrics */ + timestamp_t critical_path_time; + timestamp_t time_spent_on_unlink_local_files; + timestamp_t time_spent_on_prune_ancestors_of_temp_node; + timestamp_t time_spent_on_prune_ancestors_of_persisted_node; + + timestamp_t submission_time; + timestamp_t scheduling_time; + timestamp_t commit_time; + timestamp_t execution_time; + timestamp_t retrieval_time; + timestamp_t postprocessing_time; +}; + +/** Create a new vine node. +@param node_id Unique node identifier supplied by the owning graph. +@return Newly allocated vine node instance. +*/ +struct vine_node *vine_node_create(uint64_t node_id); + +/** Create the task arguments for a vine node. +@param node Reference to the vine node. +@return The task arguments in JSON format: {"fn_args": [node_id], "fn_kwargs": {}}. +*/ +char *vine_node_construct_task_arguments(struct vine_node *node); + +/** Delete a vine node and release owned resources. +@param node Reference to the vine node. +*/ +void vine_node_delete(struct vine_node *node); + +/** Print information about a vine node. +@param node Reference to the vine node. +*/ +void vine_node_debug_print(struct vine_node *node); + +/** Find all safe ancestors of a vine node. +@param start_node Reference to the start node. +@return The set of safe ancestors. +*/ +struct set *vine_node_find_safe_ancestors(struct vine_node *start_node); + +/** Find all parents of a vine node at a specific depth. +@param node Reference to the node. +@param depth Reference to the depth. +@return The list of parents. +*/ +struct list *vine_node_find_parents_by_depth(struct vine_node *node, int depth); + +/** Update the critical path time of a vine node. +@param node Reference to the vine node. +@param execution_time Reference to the execution time of the node. +*/ +void vine_node_update_critical_path_time(struct vine_node *node, timestamp_t execution_time); + +#endif // VINE_NODE_H \ No newline at end of file diff --git a/taskvine/src/graph/vinedag/vinedag.py b/taskvine/src/graph/vinedag/vinedag.py new file mode 100644 index 0000000000..77975df3f5 --- /dev/null +++ b/taskvine/src/graph/vinedag/vinedag.py @@ -0,0 +1,314 @@ +# Copyright (C) 2025 The University of Notre Dame +# This software is distributed under the GNU General Public License. +# See the file COPYING for details. 
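+
+"""VineDAG: a TaskVine Manager subclass that mirrors a Python task graph into the C vine_graph orchestration layer."""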
+ +from ndcctools.taskvine import cvine +from ndcctools.taskvine.manager import Manager + +from ndcctools.taskvine.vinedag.context_graph.proxy_library import ProxyLibrary +from ndcctools.taskvine.vinedag.context_graph.proxy_functions import compute_single_key +from ndcctools.taskvine.vinedag.context_graph.core import ContextGraph, ContextGraphTaskResult +from ndcctools.taskvine.vinedag.vine_graph.vine_graph_client import VineGraphClient + +import cloudpickle +import os +import signal +import json + +try: + import dask +except ImportError: + dask = None + +try: + from dask.base import is_dask_collection +except ImportError: + is_dask_collection = None + +try: + import dask._task_spec as dts +except ImportError: + dts = None + + +def delete_all_files(root_dir): + """Clean the run-info template directory between runs so stale files never leak into a new DAG.""" + if not os.path.exists(root_dir): + return + for dirpath, dirnames, filenames in os.walk(root_dir): + for filename in filenames: + file_path = os.path.join(dirpath, filename) + try: + os.remove(file_path) + except FileNotFoundError: + print(f"Failed to delete file {file_path}") + + +# Nicely format terminal output when printing manager metadata. +def color_text(text, color_code): + """Render a colored string for the friendly status banners Vineyard prints at start-up.""" + return f"\033[{color_code}m{text}\033[0m" + + +# Flatten Dask collections into the dict-of-tasks structure the rest of the +# pipeline expects. VineDAG clients often hand us a dict like +# {"result": dask.delayed(...)}; we merge the underlying HighLevelGraphs so +# `ContextGraph` sees the same dict representation C does. +def dask_collections_to_task_dict(collection_dict): + """Merge user-facing Dask collections into the flattened task dict the ContextGraph expects.""" + assert is_dask_collection is not None + from dask.highlevelgraph import HighLevelGraph, ensure_dict + + if not isinstance(collection_dict, dict): + raise TypeError("Input must be a dict") + + for k, v in collection_dict.items(): + if not is_dask_collection(v): + raise TypeError(f"Input must be a dict of DaskCollection, but found {k} with type {type(v)}") + + if dts: + # the new Dask API + sub_hlgs = [v.dask for v in collection_dict.values()] + hlg = HighLevelGraph.merge(*sub_hlgs).to_dict() + else: + # the old Dask API + hlg = dask.base.collections_to_dsk(collection_dict.values()) + + return ensure_dict(hlg) + + +# Accept both plain dicts and Dask collections from callers. Most library users +# hand us `{key: delayed / value}` directly, while some experiments pass a +# fully-expanded legacy Dask dict. This helper normalises both cases so the rest +# of the pipeline only deals with `{task_key: task_expression}`. +def ensure_task_dict(collection_dict): + """Normalize user input (raw dict or Dask collection) into a plain `{task_key: expr}` mapping.""" + if is_dask_collection and any(is_dask_collection(v) for v in collection_dict.values()): + task_dict = dask_collections_to_task_dict(collection_dict) + else: + task_dict = collection_dict + + if dts: + return dts.convert_legacy_graph(task_dict) + else: + return task_dict + + +class GraphParams: + def __init__(self): + """Hold all tweakable knobs (manager-side, vine_graph-side, and misc).""" + # Manager-level knobs: fed into `Manager.tune(...)` before execution. 
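As a brief aside before the default knobs, a hypothetical illustration of the two input shapes ensure_task_dict() above accepts; inc and the keys are placeholders.

import dask

def inc(x):
    return x + 1

# 1) A classic {key: (fn, *args)} task dict passes through essentially as-is
#    (convert_legacy_graph() is applied when the new Dask task spec is available),
#    so the keys stay "a" and "b".
legacy = ensure_task_dict({"a": 1, "b": (inc, "a")})

# 2) A dict of Dask collections is first flattened into its underlying graph,
#    so the resulting keys are Dask's generated task keys rather than "b".
flattened = ensure_task_dict({"b": dask.delayed(inc)(1)})

# Downstream, VineDAG.run() (defined later in this file) applies the same
# normalization before mirroring the graph into C, e.g.:
#   VineDAG(...).run({"a": 1, "b": (inc, "a")}, target_keys=["b"])

GraphParams' default values for those manager-level knobs follow.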
+ self.vine_manager_tuning_params = { + "worker-source-max-transfers": 100, + "max-retrievals": -1, + "prefer-dispatch": 1, + "transient-error-interval": 1, + "attempt-schedule-depth": 10000, + "temp-replica-count": 1, + "enforce-worker-eviction-interval": -1, + "balance-worker-disk-load": 0, + } + # VineGraph-level knobs: forwarded to the underlying vine graph via VineGraphClient. + self.vine_graph_tuning_params = { + "failure-injection-step-percent": -1, + "task-priority-mode": "largest-input-first", + "prune-depth": 1, + "output-dir": "./outputs", + "checkpoint-dir": "./checkpoints", + "checkpoint-fraction": 0, + "progress-bar-update-interval-sec": 0.1, + "time-metrics-filename": "time_metrics.csv", + "enable-debug-log": 1, + } + # Misc knobs used purely on the Python side (e.g., generate fake outputs). + self.other_params = { + "schedule": "worst", + "libcores": 16, + "failure-injection-step-percent": -1, + "extra-task-output-size-mb": [0, 0], + "extra-task-sleep-time": [0, 0], + } + + def print_params(self): + """Dump current knob values to stdout for debugging.""" + all_params = {**self.vine_manager_tuning_params, **self.vine_graph_tuning_params, **self.other_params} + print(json.dumps(all_params, indent=4)) + + def update_param(self, param_name, new_value): + """Update a single knob, falling back to manager-level if unknown.""" + if param_name in self.vine_manager_tuning_params: + self.vine_manager_tuning_params[param_name] = new_value + elif param_name in self.vine_graph_tuning_params: + self.vine_graph_tuning_params[param_name] = new_value + elif param_name in self.other_params: + self.other_params[param_name] = new_value + else: + self.vine_manager_tuning_params[param_name] = new_value + + def get_value_of(self, param_name): + """Helper so VineDAG can pull a knob value without caring where it lives.""" + if param_name in self.vine_manager_tuning_params: + return self.vine_manager_tuning_params[param_name] + elif param_name in self.vine_graph_tuning_params: + return self.vine_graph_tuning_params[param_name] + elif param_name in self.other_params: + return self.other_params[param_name] + else: + raise ValueError(f"Invalid param name: {param_name}") + + +class VineDAG(Manager): + def __init__(self, + *args, + **kwargs): + """Spin up a TaskVine manager that knows how to mirror a Python DAG into the C orchestration layer.""" + + # React to Ctrl+C so we can tear down the graphs cleanly. + signal.signal(signal.SIGINT, self._on_sigint) + + self.params = GraphParams() + + # Ensure run-info templates don't accumulate garbage between runs. + run_info_path = kwargs.get("run_info_path", None) + run_info_template = kwargs.get("run_info_template", None) + self.run_info_template_path = os.path.join(run_info_path, run_info_template) + if self.run_info_template_path: + delete_all_files(self.run_info_template_path) + + # Boot the underlying TaskVine manager. 
The TaskVine manager keeps alive until the vinedag object is destroyed + super().__init__(*args, **kwargs) + print(f"cvine = {cvine}") + self.runtime_directory = cvine.vine_get_runtime_directory(self._taskvine) + + print(f"=== Manager name: {color_text(self.name, 92)}") + print(f"=== Manager port: {color_text(self.port, 92)}") + print(f"=== Runtime directory: {color_text(self.runtime_directory, 92)}") + + def param(self, param_name): + """Convenience accessor so callers can read tuned parameters at runtime.""" + return self.params.get_value_of(param_name) + + def update_params(self, new_params): + """Apply a batch of overrides before constructing graphs. + + All parameter dictionaries—whether set via `update_params()` or passed + to `run(..., params={...})`—flow through here. We funnel each key into + the appropriate bucket (manager/vine_graph/misc). Subsequent runs can override + them by calling this again. + """ + assert isinstance(new_params, dict), "new_params must be a dict" + for k, new_v in new_params.items(): + self.params.update_param(k, new_v) + + def tune_manager(self): + """Push our manager-side tuning knobs into the C layer.""" + for k, v in self.params.vine_manager_tuning_params.items(): + try: + self.tune(k, v) + except Exception: + raise ValueError(f"Unrecognized parameter: {k}") + + def tune_vine_graph(self, vine_graph): + """Push VineGraph-specific tuning knobs before we build the graph.""" + for k, v in self.params.vine_graph_tuning_params.items(): + vine_graph.tune(k, str(v)) + + def build_context_graph(self, task_dict): + """Construct the Python-side DAG wrapper (ContextGraph).""" + context_graph = ContextGraph( + task_dict, + extra_task_output_size_mb=self.param("extra-task-output-size-mb"), + extra_task_sleep_time=self.param("extra-task-sleep-time") + ) + + return context_graph + + def build_vine_graph(self, context_graph, target_keys): + """Mirror the ContextGraph into VineGraph, preserving ordering and targets.""" + assert context_graph is not None, "ContextGraph must be built before building the VineGraph" + + vine_graph = VineGraphClient(self._taskvine) + + vine_graph.set_proxy_function(compute_single_key) + + # Tune both manager and vine_graph before we start adding nodes/edges. + self.tune_manager() + self.tune_vine_graph(vine_graph) + + topo_order = context_graph.get_topological_order() + # Build the cross-language mapping as we walk the topo order. + for k in topo_order: + node_id = vine_graph.add_node(k) + context_graph.ckey2vid[k] = node_id + context_graph.vid2ckey[node_id] = k + for pk in context_graph.parents_of[k]: + vine_graph.add_dependency(pk, k) + + # Now that every node is present, mark which ones are final outputs. + for k in target_keys: + vine_graph.set_target(k) + + vine_graph.compute_topology_metrics() + + return vine_graph + + def build_graphs(self, task_dict, target_keys): + """Create both the ContextGraph and its C counterpart, wiring outputs for later use.""" + # Build the logical (Python) DAG. + context_graph = self.build_context_graph(task_dict) + # Build the physical (C) DAG. + vine_graph = self.build_vine_graph(context_graph, target_keys) + + # Cross-fill the outfile locations so the runtime graph knows where to read/write. 
+ for k in context_graph.ckey2vid: + outfile_remote_name = vine_graph.get_node_outfile_remote_name(k) + context_graph.outfile_remote_name[k] = outfile_remote_name + + return context_graph, vine_graph + + def create_proxy_library(self, context_graph, vine_graph, hoisting_modules, env_files): + """Package up the context_graph as a TaskVine library.""" + proxy_library = ProxyLibrary(self) + proxy_library.add_hoisting_modules(hoisting_modules) + proxy_library.add_env_files(env_files) + proxy_library.set_context_loader(ContextGraph.context_loader_func, context_loader_args=[cloudpickle.dumps(context_graph)]) + proxy_library.set_libcores(self.param("libcores")) + proxy_library.set_name(vine_graph.get_proxy_library_name()) + + return proxy_library + + def run(self, collection_dict, target_keys=[], params={}, hoisting_modules=[], env_files={}): + """High-level entry point: normalise input, build graphs, ship the library, execute, and return results.""" + # first update the params so that they can be used for the following construction + self.update_params(params) + + task_dict = ensure_task_dict(collection_dict) + + # Build both the Python DAG and its C mirror. + context_graph, vine_graph = self.build_graphs(task_dict, target_keys) + + # Ship the execution context to workers via a proxy library. + proxy_library = self.create_proxy_library(context_graph, vine_graph, hoisting_modules, env_files) + proxy_library.install() + + print(f"=== Library serialized size: {color_text(proxy_library.get_context_size(), 92)} MB") + + # Kick off execution on the C side. + vine_graph.execute() + + # Tear down once we're done so successive runs start clean. + proxy_library.uninstall() + + # Delete the C graph immediately so its lifetime matches the run. + vine_graph.delete() + + # Load any requested target outputs back into Python land. + results = {} + for k in target_keys: + outfile_path = os.path.join(self.param("output-dir"), context_graph.outfile_remote_name[k]) + results[k] = ContextGraphTaskResult.load_from_path(outfile_path) + return results + + def _on_sigint(self, signum, frame): + """SIGINT handler that delegates to Manager cleanup so workers are released promptly.""" + self.__del__() diff --git a/taskvine/src/manager/Makefile b/taskvine/src/manager/Makefile index a036e9bd64..5c56c9f09b 100644 --- a/taskvine/src/manager/Makefile +++ b/taskvine/src/manager/Makefile @@ -28,7 +28,8 @@ SOURCES = \ vine_file_replica_table.c \ vine_fair.c \ vine_runtime_dir.c \ - vine_task_groups.c + vine_task_groups.c \ + vine_temp.c PUBLIC_HEADERS = taskvine.h @@ -39,6 +40,7 @@ TARGETS = $(LIBRARIES) all: $(TARGETS) + libtaskvine.a: $(OBJECTS) install: all diff --git a/taskvine/src/manager/stnPTyT6 b/taskvine/src/manager/stnPTyT6 new file mode 100644 index 0000000000..8e72ad4194 Binary files /dev/null and b/taskvine/src/manager/stnPTyT6 differ diff --git a/taskvine/src/manager/taskvine.h b/taskvine/src/manager/taskvine.h index 16152b7102..ebce1899a2 100644 --- a/taskvine/src/manager/taskvine.h +++ b/taskvine/src/manager/taskvine.h @@ -149,7 +149,6 @@ typedef enum { VINE_MINI_TASK, /**< A file obtained by executing a Unix command line. */ } vine_file_type_t; - /** Statistics describing a manager. */ struct vine_stats { /* Stats for the current state of workers: */ @@ -929,8 +928,9 @@ The given file or directory object is deleted from all worker's caches, but is still available on the manager's site, and can be recovered by submitting a recovery task. @param m A manager object @param f Any file object. 
+@return The number of replicas pruned. */ -void vine_prune_file(struct vine_manager *m, struct vine_file *f); +int vine_prune_file(struct vine_manager *m, struct vine_file *f); //@} @@ -1117,6 +1117,14 @@ int vine_enable_peer_transfers(struct vine_manager *m); /** Disable taskvine peer transfers to be scheduled by the manager **/ int vine_disable_peer_transfers(struct vine_manager *m); +/** Enable recovery tasks to be returned by vine_wait. +By default, recovery tasks are handled internally by the manager. **/ +int vine_enable_return_recovery_tasks(struct vine_manager *m); + +/** Disable recovery tasks from being returned by vine_wait. +Recovery tasks will be handled internally by the manager. **/ +int vine_disable_return_recovery_tasks(struct vine_manager *m); + /** When enabled, resources to tasks in are assigned in proportion to the size of the worker. If a resource is specified (e.g. with @ref vine_task_set_cores), proportional resources never go below explicit specifications. This mode is most @@ -1528,6 +1536,12 @@ void vine_counters_print(); */ char *vine_version_string(); +/** Returns the runtime directory +@param m Reference to the current manager object. +@return A string. +*/ +char *vine_get_runtime_directory(struct vine_manager *m); + /** Returns path relative to the logs runtime directory @param m Reference to the current manager object. @param path Target filename. diff --git a/taskvine/src/manager/vine_file_replica_table.c b/taskvine/src/manager/vine_file_replica_table.c index d6ba403f5b..7619a01833 100644 --- a/taskvine/src/manager/vine_file_replica_table.c +++ b/taskvine/src/manager/vine_file_replica_table.c @@ -25,7 +25,8 @@ See the file COPYING for details. int vine_file_replica_table_insert(struct vine_manager *m, struct vine_worker_info *w, const char *cachename, struct vine_file_replica *replica) { if (hash_table_lookup(w->current_files, cachename)) { - return 0; + // delete the previous replcia because the replica's size might have changed + vine_file_replica_table_remove(m, w, cachename); } double prev_available = w->resources->disk.total - BYTES_TO_MEGABYTES(w->inuse_cache); diff --git a/taskvine/src/manager/vine_manager.c b/taskvine/src/manager/vine_manager.c index ca72d257d9..e6ff9a3660 100644 --- a/taskvine/src/manager/vine_manager.c +++ b/taskvine/src/manager/vine_manager.c @@ -29,6 +29,7 @@ See the file COPYING for details. 
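A minimal, hypothetical client-side sketch of the revised calls above; vine_create() and vine_declare_temp() are existing taskvine.h entry points, the port number is a placeholder, and error handling is omitted.

#include "taskvine.h"
#include <stdio.h>

int main(void)
{
	struct vine_manager *m = vine_create(9123);

	/* Opt in to having recovery tasks surface through vine_wait(). */
	vine_enable_return_recovery_tasks(m);

	/* New accessor introduced by this patch. */
	printf("runtime directory: %s\n", vine_get_runtime_directory(m));

	/* ... declare files, submit tasks, vine_wait() ... */

	/* vine_prune_file() now reports how many remote replicas it removed. */
	struct vine_file *scratch = vine_declare_temp(m);
	int pruned = vine_prune_file(m, scratch);
	printf("pruned %d remote replicas\n", pruned);

	vine_delete(m);
	return 0;
}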
#include "vine_taskgraph_log.h" #include "vine_txn_log.h" #include "vine_worker_info.h" +#include "vine_temp.h" #include "address.h" #include "buffer.h" @@ -146,6 +147,7 @@ static vine_msg_code_t handle_manager_status(struct vine_manager *q, struct vine static vine_msg_code_t handle_resources(struct vine_manager *q, struct vine_worker_info *w, time_t stoptime); static vine_msg_code_t handle_feature(struct vine_manager *q, struct vine_worker_info *w, const char *line); static void handle_library_update(struct vine_manager *q, struct vine_worker_info *w, const char *line); +static int receive_tasks_from_worker(struct vine_manager *q, struct vine_worker_info *w, int count_received_so_far); static struct jx *manager_to_jx(struct vine_manager *q); static struct jx *manager_lean_to_jx(struct vine_manager *q); @@ -165,10 +167,12 @@ static int vine_manager_check_inputs_available(struct vine_manager *q, struct vi static void vine_manager_consider_recovery_task(struct vine_manager *q, struct vine_file *lost_file, struct vine_task *rt); static void delete_uncacheable_files(struct vine_manager *q, struct vine_worker_info *w, struct vine_task *t); -static int delete_worker_file(struct vine_manager *q, struct vine_worker_info *w, const char *filename, vine_cache_level_t cache_level, vine_cache_level_t delete_upto_level); static int release_worker(struct vine_manager *q, struct vine_worker_info *w); struct vine_task *send_library_to_worker(struct vine_manager *q, struct vine_worker_info *w, const char *name); +static void push_task_to_ready_tasks(struct vine_manager *q, struct vine_task *t); + +static void clean_redundant_replicas(struct vine_manager *q, struct vine_file *f); /* Return the number of workers matching a given type: WORKER, STATUS, etc */ @@ -418,9 +422,19 @@ static vine_msg_code_t handle_cache_update(struct vine_manager *q, struct vine_w f->state = VINE_FILE_STATE_CREATED; f->size = size; - /* And if the file is a newly created temporary, replicate as needed. */ - if (f->type == VINE_TEMP && *id == 'X' && q->temp_replica_count > 1) { - hash_table_insert(q->temp_files_to_replicate, f->cached_name, NULL); + /* If the replica's type was a URL, it means the manager expected the destination worker to download it + * from elsewhere. Now that it's physically present, we can resolve its type back to the original */ + if (replica->type == VINE_URL) { + replica->type = f->type; + } + + /* If a TEMP file, replicate as needed. */ + if (f->type == VINE_TEMP) { + vine_temp_replicate_file_later(q, f); + + if (q->balance_worker_disk_load) { + clean_redundant_replicas(q, f); + } } } } @@ -476,6 +490,9 @@ static vine_msg_code_t handle_cache_invalid(struct vine_manager *q, struct vine_ w->last_failure_time = timestamp_get(); } + /* If the creation failed, we may want to backup the file somewhere else. */ + vine_temp_handle_file_lost(q, cachename); + /* Successfully processed this message. 
*/ return VINE_MSG_PROCESSED; } else { @@ -572,6 +589,8 @@ static vine_result_code_t get_completion_result(struct vine_manager *q, struct v return VINE_SUCCESS; } + t->time_when_get_result_start = timestamp_get(); + if (task_status != VINE_RESULT_SUCCESS) { w->last_failure_time = timestamp_get(); t->time_when_last_failure = w->last_failure_time; @@ -650,6 +669,17 @@ static vine_result_code_t get_completion_result(struct vine_manager *q, struct v itable_remove(q->running_table, t->task_id); vine_task_set_result(t, task_status); + /* Clean redundant replicas for the inputs */ + struct vine_mount *input_mount; + LIST_ITERATE(t->input_mounts, input_mount) + { + if (input_mount->file && input_mount->file->type == VINE_TEMP) { + clean_redundant_replicas(q, input_mount->file); + } + } + + t->time_when_get_result_end = timestamp_get(); + return VINE_SUCCESS; } @@ -662,6 +692,7 @@ static vine_msg_code_t handle_complete(struct vine_manager *q, struct vine_worke { vine_result_code_t result = get_completion_result(q, w, line); if (result == VINE_SUCCESS) { + receive_tasks_from_worker(q, w, 0); return VINE_MSG_PROCESSED; } return VINE_MSG_NOT_PROCESSED; @@ -935,8 +966,74 @@ static void cleanup_worker_files(struct vine_manager *q, struct vine_worker_info hash_table_free_keys_array(cachenames); } -/* -This function enforces a target worker eviction rate (1 every X seconds). +/** Check if a file is busy by checking if it is an input file of any task. */ +static int is_file_busy(struct vine_manager *q, struct vine_worker_info *w, struct vine_file *f) +{ + if (!q || !w || !f) { + return 0; + } + + uint64_t task_id; + struct vine_task *task; + ITABLE_ITERATE(w->current_tasks, task_id, task) + { + struct vine_mount *input_mount; + LIST_ITERATE(task->input_mounts, input_mount) + { + if (f == input_mount->file) { + return 1; + } + } + } + + return 0; +} + +/** Evict a random worker to simulate a worker failure. */ +int evict_random_worker(struct vine_manager *q) +{ + if (!q) { + return 0; + } + + if (hash_table_size(q->worker_table) == 0) { + return 0; + } + + int removed = 0; + + /* collect removable workers */ + struct list *candidates_list = list_create(); + char *key; + struct vine_worker_info *w; + HASH_TABLE_ITERATE(q->worker_table, key, w) + { + list_push_tail(candidates_list, w); + } + + /* release a random worker if any */ + int random_number = random_int64(); + if (random_number < 0) { + random_number = -random_number; + } + int index = (int)(random_number % list_size(candidates_list)); + int i = 0; + while ((w = list_pop_head(candidates_list))) { + if (i++ == index) { + /* evict this worker */ + debug(D_VINE | D_NOTICE, "Intentionally evicting worker %s", w->hostname); + release_worker(q, w); + removed = 1; + break; + } + } + + list_delete(candidates_list); + return removed; +} + +/** +Enforces a target worker eviction rate (1 every X seconds). If the observed eviction interval is shorter than the desired one, we randomly evict one worker to keep the eviction pace aligned with the target. This includes all types of removals, whether graceful or due to failures. @@ -973,32 +1070,150 @@ static int enforce_worker_eviction_interval(struct vine_manager *q) return 0; } - /* collect removable workers */ - struct list *candidates_list = list_create(); + /* evict a random worker if any */ + return evict_random_worker(q); +} + +/** Get the available disk space in bytes for a worker. 
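As a rough, hypothetical illustration only: manager-internal test code could lean on the newly exported helper above to inject failures; the loop bound is arbitrary and q is the manager.

/* Simulate up to three worker failures to stress temp-file replication and recovery. */
for (int i = 0; i < 3; i++) {
	if (!evict_random_worker(q)) {
		break; /* no workers left to evict */
	}
}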
*/ +int64_t get_worker_available_disk_bytes(struct vine_worker_info *w) +{ + if (!w || !w->resources) { + return 0; + } + + return (int64_t)MEGABYTES_TO_BYTES(w->resources->disk.total) - w->inuse_cache; +} + +/** Clean redundant replicas of a temporary file. */ +static void clean_redundant_replicas(struct vine_manager *q, struct vine_file *f) +{ + if (!f || f->type != VINE_TEMP) { + return; + } + + // remove excess replicas of temporary files + struct set *source_workers = hash_table_lookup(q->file_worker_table, f->cached_name); + if (!source_workers) { + // no surprise - a cache-update may trigger a file deletion! + return; + } + int replicas_to_remove = set_size(source_workers) - q->temp_replica_count; + if (replicas_to_remove <= 0) { + return; + } + // note that this replica can be a source to a peer transfer, if this is unlinked, + // a corresponding transfer may fail and result in a forsaken task + // therefore, we need to wait until all replicas are ready + if (vine_file_replica_table_count_replicas(q, f->cached_name, VINE_FILE_REPLICA_STATE_READY) != set_size(source_workers)) { + return; + } + + struct priority_queue *offload_from_workers = priority_queue_create(0); + + struct vine_worker_info *source_worker = NULL; + SET_ITERATE(source_workers, source_worker) + { + // workers with more used disk are prioritized for removing + if (is_file_busy(q, source_worker, f)) { + continue; + } + + priority_queue_push(offload_from_workers, source_worker, source_worker->inuse_cache); + } + + struct vine_worker_info *offload_from_worker = NULL; + while (replicas_to_remove-- > 0 && (offload_from_worker = priority_queue_pop(offload_from_workers))) { + delete_worker_file(q, offload_from_worker, f->cached_name, 0, 0); + } + priority_queue_delete(offload_from_workers); + + return; +} + +/* Shift disk load between workers to balance the disk usage. 
*/ +static void rebalance_worker_disk_usage(struct vine_manager *q) +{ + if (!q) { + return; + } + + struct vine_worker_info *worker_with_min_disk_usage = NULL; + struct vine_worker_info *worker_with_max_disk_usage = NULL; + char *key; struct vine_worker_info *w; HASH_TABLE_ITERATE(q->worker_table, key, w) { - if (w->type != VINE_WORKER_TYPE_WORKER) { + if (!w->transfer_port_active) { continue; } - list_push_tail(candidates_list, w); + if (w->draining) { + continue; + } + if (!w->resources) { + continue; + } + if (w->resources->tag < 0 || w->resources->disk.total < 1) { + continue; + } + if (!worker_with_min_disk_usage || w->inuse_cache < worker_with_min_disk_usage->inuse_cache) { + if (w->incoming_xfer_counter < q->worker_source_max_transfers) { + worker_with_min_disk_usage = w; + } + } + if (!worker_with_max_disk_usage || w->inuse_cache > worker_with_max_disk_usage->inuse_cache) { + if (w->outgoing_xfer_counter < q->worker_source_max_transfers) { + worker_with_max_disk_usage = w; + } + } } - /* release a random worker if any */ - int index = (int)(random_int64() % list_size(candidates_list)); - int i = 0; - while ((w = list_pop_head(candidates_list))) { - if (i++ == index) { - /* evict this worker */ - debug(D_VINE | D_NOTICE, "Intentionally evicting worker %s", w->hostname); - release_worker(q, w); + if (!worker_with_min_disk_usage || !worker_with_max_disk_usage || worker_with_min_disk_usage == worker_with_max_disk_usage) { + return; + } + + int64_t min_inuse_cache = worker_with_min_disk_usage->inuse_cache; + int64_t max_inuse_cache = worker_with_max_disk_usage->inuse_cache; + + if (min_inuse_cache * 1.2 >= max_inuse_cache) { + return; + } + + if (max_inuse_cache <= q->peak_used_cache) { + return; + } + q->peak_used_cache = max_inuse_cache; + + int64_t bytes_to_offload = (int64_t)((max_inuse_cache - min_inuse_cache) / 2); + + char *cachename; + struct vine_file_replica *replica; + HASH_TABLE_ITERATE(worker_with_max_disk_usage->current_files, cachename, replica) + { + if (replica->type != VINE_TEMP) { + continue; + } + struct vine_file *f = hash_table_lookup(q->file_table, cachename); + if (!f) { + continue; + } + if (vine_file_replica_table_lookup(worker_with_min_disk_usage, cachename)) { + continue; + } + + vine_temp_start_peer_transfer(q, f, worker_with_max_disk_usage, worker_with_min_disk_usage); + bytes_to_offload -= replica->size; + if (bytes_to_offload <= 0) { break; } - } - list_delete(candidates_list); - return 1; + if (worker_with_min_disk_usage->incoming_xfer_counter >= q->worker_source_max_transfers) { + break; + } + if (worker_with_max_disk_usage->outgoing_xfer_counter >= q->worker_source_max_transfers) { + break; + } + } } /* Remove all tasks and other associated state from a given worker. 
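A worked example of the trigger condition in rebalance_worker_disk_usage() above, with made-up numbers: if the least-loaded eligible worker caches 10 GB and the most-loaded caches 30 GB, then 10 GB * 1.2 = 12 GB < 30 GB, so the imbalance check passes. Half the gap, bytes_to_offload = (30 GB - 10 GB) / 2 = 10 GB, of TEMP replicas is then pushed from the heavy worker to the light one, stopping early if either side's transfer counter reaches worker-source-max-transfers.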
*/ @@ -1036,85 +1251,6 @@ static void cleanup_worker(struct vine_manager *q, struct vine_worker_info *w) cleanup_worker_files(q, w); } -/* Start replicating files that may need replication */ -static int consider_tempfile_replications(struct vine_manager *q) -{ - if (hash_table_size(q->temp_files_to_replicate) <= 0) { - return 0; - } - - char *cached_name = NULL; - void *empty_val = NULL; - int total_replication_request_sent = 0; - - static char key_start[PATH_MAX] = "random init"; - int iter_control; - int iter_count_var; - - struct list *to_remove = list_create(); - - HASH_TABLE_ITERATE_FROM_KEY(q->temp_files_to_replicate, iter_control, iter_count_var, key_start, cached_name, empty_val) - { - struct vine_file *f = hash_table_lookup(q->file_table, cached_name); - - if (!f) { - continue; - } - - /* are there any available source workers? */ - struct set *source_workers = hash_table_lookup(q->file_worker_table, f->cached_name); - if (!source_workers) { - /* If no source workers found, it indicates that the file doesn't exist, either pruned or lost. - Because a pruned file is removed from the recovery queue, so it definitely indicates that the file is lost. */ - if (q->transfer_temps_recovery && file_needs_recovery(q, f)) { - vine_manager_consider_recovery_task(q, f, f->recovery_task); - } - list_push_tail(to_remove, xxstrdup(f->cached_name)); - continue; - } - - /* at least one source is able to transfer? */ - int has_valid_source = 0; - struct vine_worker_info *s; - SET_ITERATE(source_workers, s) - { - if (s->transfer_port_active && s->outgoing_xfer_counter < q->worker_source_max_transfers && !s->draining) { - has_valid_source = 1; - break; - } - } - if (!has_valid_source) { - continue; - } - - /* has this file been fully replicated? */ - int nsource_workers = set_size(source_workers); - int to_find = MIN(q->temp_replica_count - nsource_workers, q->transfer_replica_per_cycle); - if (to_find <= 0) { - list_push_tail(to_remove, xxstrdup(f->cached_name)); - continue; - } - - // debug(D_VINE, "Found %d workers holding %s, %d replicas needed", nsource_workers, f->cached_name, to_find); - - int round_replication_request_sent = vine_file_replica_table_replicate(q, f, source_workers, to_find); - total_replication_request_sent += round_replication_request_sent; - - if (total_replication_request_sent >= q->attempt_schedule_depth) { - break; - } - } - - while ((cached_name = list_pop_head(to_remove))) { - hash_table_remove(q->temp_files_to_replicate, cached_name); - free(cached_name); - } - - list_delete(to_remove); - - return total_replication_request_sent; -} - /* Insert into hashtable temp files that may need replication. 
*/ static void recall_worker_lost_temp_files(struct vine_manager *q, struct vine_worker_info *w) @@ -1127,11 +1263,7 @@ static void recall_worker_lost_temp_files(struct vine_manager *q, struct vine_wo // Iterate over files we want might want to recover HASH_TABLE_ITERATE(w->current_files, cached_name, info) { - struct vine_file *f = hash_table_lookup(q->file_table, cached_name); - - if (f && f->type == VINE_TEMP) { - hash_table_insert(q->temp_files_to_replicate, cached_name, NULL); - } + vine_temp_handle_file_lost(q, cached_name); } } @@ -1243,7 +1375,7 @@ static void add_worker(struct vine_manager *q) /* Delete a single file on a remote worker except those with greater delete_upto_level cache level */ -static int delete_worker_file(struct vine_manager *q, struct vine_worker_info *w, const char *filename, vine_cache_level_t cache_level, vine_cache_level_t delete_upto_level) +int delete_worker_file(struct vine_manager *q, struct vine_worker_info *w, const char *filename, vine_cache_level_t cache_level, vine_cache_level_t delete_upto_level) { if (cache_level <= delete_upto_level) { process_replica_on_event(q, w, filename, VINE_FILE_REPLICA_STATE_TRANSITION_EVENT_UNLINK); @@ -1503,82 +1635,6 @@ static int fetch_outputs_from_worker(struct vine_manager *q, struct vine_worker_ return 1; } -/* -Consider the set of tasks that are waiting but not running. -Cancel those that cannot run for unfixable policy reasons, -such as exceeded the absolute end time, no library task available, etc. -This is done in a separate iteration outside of scheduling -to avoid the cost of these checks in the critical path. -*/ - -static int expire_waiting_tasks(struct vine_manager *q) -{ - struct vine_task *t; - int t_idx; - int expired = 0; - - /* Measure the current time once for the whole iteration. */ - double current_time = timestamp_get() / ONE_SECOND; - - /* Only work through the queue up to iter_depth. */ - int iter_count = 0; - int iter_depth = MIN(priority_queue_size(q->ready_tasks), q->attempt_schedule_depth); - - PRIORITY_QUEUE_STATIC_ITERATE(q->ready_tasks, t_idx, t, iter_count, iter_depth) - { - /* In this loop, use VINE_RESULT_SUCCESS as an indication of "still ok to run". */ - vine_result_t result = VINE_RESULT_SUCCESS; - - /* Consider each of the possible task expiration reasons. */ - - if (t->resources_requested->end > 0 && t->resources_requested->end <= current_time) { - debug(D_VINE, "task %d has exceeded its end time", t->task_id); - result = VINE_RESULT_MAX_END_TIME; - } else if (t->needs_library && !hash_table_lookup(q->library_templates, t->needs_library)) { - debug(D_VINE, "task %d does not match any submitted library named \"%s\"", t->task_id, t->needs_library); - result = VINE_RESULT_MISSING_LIBRARY; - } - - /* If any of the reasons fired, then expire the task and put in the retrieved queue. */ - if (result != VINE_RESULT_SUCCESS) { - vine_task_set_result(t, result); - priority_queue_remove(q->ready_tasks, t_idx); - change_task_state(q, t, VINE_TASK_RETRIEVED); - expired++; - } - } - - /* Return the number of tasks expired. */ - return expired; -} - -/* -Consider the set of tasks that are waiting with strict inputs -Terminate those to which no such worker exists. 
-*/ - -static int enforce_waiting_fixed_locations(struct vine_manager *q) -{ - int t_idx; - struct vine_task *t; - int terminated = 0; - - int iter_count = 0; - int iter_depth = priority_queue_size(q->ready_tasks); - - PRIORITY_QUEUE_BASE_ITERATE(q->ready_tasks, t_idx, t, iter_count, iter_depth) - { - if (t->has_fixed_locations && !vine_schedule_check_fixed_location(q, t)) { - vine_task_set_result(t, VINE_RESULT_FIXED_LOCATION_MISSING); - change_task_state(q, t, VINE_TASK_RETRIEVED); - priority_queue_remove(q->ready_tasks, t_idx); - terminated++; - } - } - - return terminated; -} - /* This function handles app-level failures. It remove the task from WQ and marks the task as complete so it is returned to the application. @@ -3067,12 +3123,12 @@ static void find_max_worker(struct vine_manager *q) * are not counted towards the resources in use and will be killed if needed. */ static void kill_empty_libraries_on_worker(struct vine_manager *q, struct vine_worker_info *w, struct vine_task *t) { - uint64_t task_id; - struct vine_task *task; - ITABLE_ITERATE(w->current_tasks, task_id, task) + uint64_t libtask_id; + struct vine_task *libtask; + ITABLE_ITERATE(w->current_libraries, libtask_id, libtask) { - if (task->provides_library && task->function_slots_inuse == 0 && (!t->needs_library || strcmp(t->needs_library, task->provides_library))) { - vine_cancel_by_task_id(q, task->task_id); + if (libtask->function_slots_inuse == 0 && (!t->needs_library || strcmp(t->needs_library, libtask->provides_library))) { + vine_cancel_by_task_id(q, libtask_id); } } } @@ -3519,7 +3575,7 @@ static void vine_manager_consider_recovery_task(struct vine_manager *q, struct v case VINE_TASK_INITIAL: /* The recovery task has never been run, so submit it now. */ vine_submit(q, rt); - notice(D_VINE, "Submitted recovery task %d (%s) to re-create lost temporary file %s.", rt->task_id, rt->command_line, lost_file->cached_name); + debug(D_VINE, "Submitted recovery task %d (%s) to re-create lost temporary file %s.", rt->task_id, rt->command_line, lost_file->cached_name); break; case VINE_TASK_READY: case VINE_TASK_RUNNING: @@ -3533,7 +3589,7 @@ static void vine_manager_consider_recovery_task(struct vine_manager *q, struct v * here. */ vine_task_reset(rt); vine_submit(q, rt); - notice(D_VINE, "Submitted recovery task %d (%s) to re-create lost temporary file %s.", rt->task_id, rt->command_line, lost_file->cached_name); + debug(D_VINE, "Submitted recovery task %d (%s) to re-create lost temporary file %s.", rt->task_id, rt->command_line, lost_file->cached_name); break; } } @@ -3561,6 +3617,7 @@ static int vine_manager_check_inputs_available(struct vine_manager *q, struct vi all_available = 0; } } + return all_available; } @@ -3613,6 +3670,58 @@ int consider_task(struct vine_manager *q, struct vine_task *t) return 1; } +/* Rotate pending tasks to the ready queue if they are runnable. */ +static int rotate_pending_tasks(struct vine_manager *q) +{ + if (list_size(q->pending_tasks) == 0) { + return 0; + } + + int runnable_tasks = 0; + int tasks_considered = 0; + int tasks_to_consider = MIN(list_size(q->pending_tasks), q->attempt_schedule_depth); + struct vine_task *t = NULL; + + double current_time = timestamp_get() / ONE_SECOND; + + while (tasks_considered++ < tasks_to_consider) { + t = list_pop_head(q->pending_tasks); + if (!t) { + break; + } + + /* first check if the task has exceeded its end time or does not match any submitted library */ + /* If any of the reasons fired, then expire the task and put in the retrieved queue. 
*/ + if (t->resources_requested->end > 0 && t->resources_requested->end <= current_time) { + debug(D_VINE, "task %d has exceeded its end time", t->task_id); + vine_task_set_result(t, VINE_RESULT_MAX_END_TIME); + change_task_state(q, t, VINE_TASK_RETRIEVED); + continue; + } + if (t->needs_library && !hash_table_lookup(q->library_templates, t->needs_library)) { + debug(D_VINE, "task %d does not match any submitted library named \"%s\"", t->task_id, t->needs_library); + vine_task_set_result(t, VINE_RESULT_MISSING_LIBRARY); + change_task_state(q, t, VINE_TASK_RETRIEVED); + continue; + } + if (q->fixed_location_in_queue && t->has_fixed_locations && !vine_schedule_check_fixed_location(q, t)) { + debug(D_VINE, "Missing fixed_location dependencies for task: %d", t->task_id); + vine_task_set_result(t, VINE_RESULT_FIXED_LOCATION_MISSING); + change_task_state(q, t, VINE_TASK_RETRIEVED); + continue; + } + + if (consider_task(q, t)) { + push_task_to_ready_tasks(q, t); + runnable_tasks++; + } else { + list_push_tail(q->pending_tasks, t); + } + } + + return runnable_tasks; +} + /* Advance the state of the system by selecting one task available to run, finding the best worker for that task, and then committing @@ -3621,82 +3730,95 @@ the task to the worker. static int send_one_task(struct vine_manager *q, int *tasks_ready_left_to_consider) { - int t_idx; + /* return early if no committable cores */ + int committable_cores = vine_schedule_count_committable_cores(q); + if (committable_cores == 0) { + return 0; + } + + /* rotate pending tasks before dispatching any tasks */ + rotate_pending_tasks(q); + + int committed_tasks = 0; + int tasks_considered = 0; + int tasks_to_consider = MIN(priority_queue_size(q->ready_tasks), q->attempt_schedule_depth); + + /* temporarily skipped tasks that are runnable but cannot fit on any current worker */ + struct list *skipped_tasks = list_create(); + struct vine_task *t; - int iter_count = 0; - int iter_depth = MIN(priority_queue_size(q->ready_tasks), q->attempt_schedule_depth); - - // Iterate over the ready tasks by priority. - // The first time we arrive here, the task with the highest priority is considered. However, there may be various reasons - // that this particular task is not eligible to run, such as: 1) the task requires more resources than the workers have; - // 2) the task requires input files that are not available; 3) the task failed recently; etc. (check consider_task function) - // Therefore, we may permit occasional skips of the highest priority task, and consider the next one in the queue. Similarly, - // other tasks may be skipped, too, until we find a task that is able to run. - // For a priority queue, iterating over tasks by priority is expensive, as it requires a full sort of the queue. Therefore, - // we simply iterate by numerical index if the task at the top is unable to run, and reset the cursor to the top if events - // that may enable tasks prior to the current cursor to run occur. Specifically, the following events should trigger a reset: - // 1. Task retrieval from worker (resources released or inputs available) - // 2. New worker connection (more resources available) - // 3. Delete/Insert an element prior/equal to the rotate cursor (tasks prior to the current cursor changed) - // 1 and 2 are explicitly handled by the manager where calls priority_queue_rotate_reset, while 3 is implicitly handled by - // the priority queue data structure where also invokes priority_queue_rotate_reset. 
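For orientation, the rewritten send_one_task() dispatch loop that follows is governed by two existing tuning knobs; a hedged sketch of setting them from client code, with illustrative values and a manager handle m:

/* Keep dispatching until the committable-core estimate is exhausted ... */
vine_tune(m, "prefer-dispatch", 1);

/* ... and bound how many ready/pending tasks are examined per pass. */
vine_tune(m, "attempt-schedule-depth", 10000);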
- PRIORITY_QUEUE_ROTATE_ITERATE(q->ready_tasks, t_idx, t, iter_count, iter_depth) - { - *tasks_ready_left_to_consider -= 1; + while (tasks_considered < tasks_to_consider) { + t = priority_queue_pop(q->ready_tasks); + if (!t) { + break; + } + tasks_considered++; + + t->time_when_scheduling_start = timestamp_get(); + /* this task is not runnable at all, put it back in the pending queue */ if (!consider_task(q, t)) { + list_push_tail(q->pending_tasks, t); continue; } - // Find the best worker for the task - q->stats_measure->time_scheduling = timestamp_get(); + /* select a worker for the task */ struct vine_worker_info *w = vine_schedule_task_to_worker(q, t); - q->stats->time_scheduling += timestamp_get() - q->stats_measure->time_scheduling; - if (w) { - priority_queue_remove(q->ready_tasks, t_idx); + t->time_when_scheduling_end = timestamp_get(); - // do not continue if this worker is running a group task - if (q->task_groups_enabled) { - struct vine_task *it; - uint64_t taskid; - ITABLE_ITERATE(w->current_tasks, taskid, it) - { - if (it->group_id) { - return 0; - } - } - } + /* task is runnable but no worker is fit, silently skip it */ + if (!w) { + list_push_tail(skipped_tasks, t); + continue; + } - vine_result_code_t result; - if (q->task_groups_enabled) { - result = commit_task_group_to_worker(q, w, t); - } else { - result = commit_task_to_worker(q, w, t); - } + /* commit the task to the worker */ + vine_result_code_t result; + if (q->task_groups_enabled) { + result = commit_task_group_to_worker(q, w, t); + } else { + result = commit_task_to_worker(q, w, t); + } - switch (result) { - case VINE_SUCCESS: - /* return on successful commit. */ - return 1; - break; - case VINE_APP_FAILURE: - case VINE_WORKER_FAILURE: - /* failed to dispatch, commit put the task back in the right place. */ - break; - case VINE_MGR_FAILURE: - /* special case, commit had a chained failure. */ - priority_queue_push(q->ready_tasks, t, t->priority); - break; - case VINE_END_OF_LIST: - /* shouldn't happen, keep going */ - break; - } + switch (result) { + case VINE_SUCCESS: + committed_tasks++; + break; + case VINE_APP_FAILURE: + case VINE_WORKER_FAILURE: + /* failed to dispatch, commit put the task back in the right place. */ + break; + case VINE_MGR_FAILURE: + /* special case, commit had a chained failure. 
*/ + debug(D_VINE, "Special case, failed to commit task %d to worker %s", t->task_id, w->hostname); + list_push_tail(q->pending_tasks, t); + break; + case VINE_END_OF_LIST: + /* shouldn't happen, keep going */ + break; + } + + /* continue dispatching tasks if q->prefer_dispatch is set */ + if (q->prefer_dispatch && committed_tasks < committable_cores) { + continue; + } + + /* stop when q->prefer_dispatch is not set and at least one task has been committed, + * or when it is set and all committable cores have been used */ + if (committed_tasks > 0) { + break; } } - return 0; + /* put back all tasks that were skipped */ + while ((t = list_pop_head(skipped_tasks))) { + push_task_to_ready_tasks(q, t); + } + list_delete(skipped_tasks); + + return committed_tasks; } /* @@ -4125,6 +4247,7 @@ struct vine_manager *vine_ssl_create(int port, const char *key, const char *cert q->next_task_id = 1; q->fixed_location_in_queue = 0; + q->pending_tasks = list_create(); q->ready_tasks = priority_queue_create(0); q->running_table = itable_create(0); q->waiting_retrieval_list = list_create(); @@ -4135,7 +4258,7 @@ struct vine_manager *vine_ssl_create(int port, const char *key, const char *cert q->worker_table = hash_table_create(0, 0); q->file_worker_table = hash_table_create(0, 0); - q->temp_files_to_replicate = hash_table_create(0, 0); + q->temp_files_to_replicate = priority_queue_create(0); q->worker_blocklist = hash_table_create(0, 0); q->file_table = hash_table_create(0, 0); @@ -4244,6 +4367,13 @@ struct vine_manager *vine_ssl_create(int port, const char *key, const char *cert q->enforce_worker_eviction_interval = 0; q->time_start_worker_eviction = 0; + q->return_recovery_tasks = 0; + q->num_submitted_recovery_tasks = 0; + q->balance_worker_disk_load = 0; + q->when_last_offloaded = 0; + q->peak_used_cache = 0; + q->shutting_down = 0; + if ((envstring = getenv("VINE_BANDWIDTH"))) { q->bandwidth_limit = string_metric_parse(envstring); if (q->bandwidth_limit < 0) { @@ -4320,6 +4450,20 @@ int vine_disable_peer_transfers(struct vine_manager *q) return 1; } +int vine_enable_return_recovery_tasks(struct vine_manager *q) +{ + debug(D_VINE, "Return recovery tasks enabled"); + q->return_recovery_tasks = 1; + return 1; +} + +int vine_disable_return_recovery_tasks(struct vine_manager *q) +{ + debug(D_VINE, "Return recovery tasks disabled"); + q->return_recovery_tasks = 0; + return 1; +} + int vine_enable_proportional_resources(struct vine_manager *q) { debug(D_VINE, "Proportional resources enabled"); @@ -4464,6 +4608,8 @@ void vine_delete(struct vine_manager *q) * disable the immediate recovery to avoid submitting recovery tasks for lost files */ q->immediate_recovery = 0; + q->shutting_down = 1; + vine_fair_write_workflow_info(q); release_all_workers(q); @@ -4487,8 +4633,7 @@ void vine_delete(struct vine_manager *q) hash_table_clear(q->file_worker_table, (void *)set_delete); hash_table_delete(q->file_worker_table); - hash_table_clear(q->temp_files_to_replicate, 0); - hash_table_delete(q->temp_files_to_replicate); + priority_queue_delete(q->temp_files_to_replicate); hash_table_clear(q->factory_table, (void *)vine_factory_info_delete); hash_table_delete(q->factory_table); @@ -4515,6 +4660,7 @@ void vine_delete(struct vine_manager *q) hash_table_clear(q->categories, (void *)category_free); hash_table_delete(q->categories); + list_delete(q->pending_tasks); priority_queue_delete(q->ready_tasks); itable_delete(q->running_table); list_delete(q->waiting_retrieval_list); @@ -4662,20 +4808,18 @@ char *vine_monitor_wrap(struct 
vine_manager *q, struct vine_worker_info *w, stru return wrap_cmd; } -/* Put a given task on the ready list, taking into account the task priority and the manager schedule. */ - +/* Put a given task on the ready queue, taking into account the task priority and the manager schedule. */ static void push_task_to_ready_tasks(struct vine_manager *q, struct vine_task *t) { if (t->result == VINE_RESULT_RESOURCE_EXHAUSTION) { /* when a task is resubmitted given resource exhaustion, we - * increment its priority by 1, so it gets to run as soon + * increment its priority a bit, so it gets to run as soon * as possible among those with the same priority. This avoids * the issue in which all 'big' tasks fail because the first * allocation is too small. */ - priority_queue_push(q->ready_tasks, t, t->priority + 1); - } else { - priority_queue_push(q->ready_tasks, t, t->priority); + t->priority *= 1.05; } + priority_queue_push(q->ready_tasks, t, t->priority); /* If the task has been used before, clear out accumulated state. */ vine_task_clean(t); @@ -4869,6 +5013,7 @@ int vine_submit(struct vine_manager *q, struct vine_task *t) * this distinction is important when many files are lost and the workflow is effectively rerun from scratch. */ if (t->type == VINE_TASK_TYPE_RECOVERY) { vine_task_set_priority(t, t->priority + priority_queue_get_top_priority(q->ready_tasks) + 1); + q->num_submitted_recovery_tasks++; } if (t->has_fixed_locations) { @@ -5009,6 +5154,7 @@ void vine_manager_remove_library(struct vine_manager *q, const char *name) struct vine_task *library = vine_schedule_find_library(q, w, name); while (library) { vine_cancel_by_task_id(q, library->task_id); + itable_remove(w->current_libraries, library->task_id); library = vine_schedule_find_library(q, w, name); } hash_table_remove(q->library_templates, name); @@ -5123,6 +5269,7 @@ static int poll_active_workers(struct vine_manager *q, int stoptime) // promptly dispatch tasks, while avoiding wasting cpu cycles when the // state of the system cannot be advanced. int msec = q->nothing_happened_last_wait_cycle ? 1000 : 0; + msec = 0; if (stoptime) { msec = MIN(msec, (stoptime - time(0)) * 1000); } @@ -5232,7 +5379,11 @@ struct vine_task *find_task_to_return(struct vine_manager *q, const char *tag, i return t; break; case VINE_TASK_TYPE_RECOVERY: - /* do nothing and let vine_manager_consider_recovery_task do its job */ + /* if configured to return recovery tasks, return them to the user */ + if (q->return_recovery_tasks) { + return t; + } + /* otherwise, do nothing and let vine_manager_consider_recovery_task do its job */ break; case VINE_TASK_TYPE_LIBRARY_INSTANCE: /* silently delete the task, since it was created by the manager. 
@@ -5397,27 +5548,6 @@ static struct vine_task *vine_wait_internal(struct vine_manager *q, int timeout, } while (q->max_retrievals < 0 || retrieved_this_cycle < q->max_retrievals || !priority_queue_size(q->ready_tasks)); END_ACCUM_TIME(q, time_receive); - // check for tasks that cannot run at all - BEGIN_ACCUM_TIME(q, time_internal); - result = expire_waiting_tasks(q); - END_ACCUM_TIME(q, time_internal); - if (result > 0) { - retrieved_this_cycle += result; - events++; - } - - // only check for fixed location if any are present (high overhead) - if (q->fixed_location_in_queue) { - - BEGIN_ACCUM_TIME(q, time_internal); - result = enforce_waiting_fixed_locations(q); - END_ACCUM_TIME(q, time_internal); - if (result > 0) { - retrieved_this_cycle += result; - events++; - } - } - if (retrieved_this_cycle) { // reset the rotate cursor on task retrieval priority_queue_rotate_reset(q->ready_tasks); @@ -5446,9 +5576,15 @@ static struct vine_task *vine_wait_internal(struct vine_manager *q, int timeout, } } + // Check if any worker is overloaded and rebalance the disk usage + if (q->balance_worker_disk_load && (timestamp_get() - q->when_last_offloaded > 5 * 1e6)) { + rebalance_worker_disk_usage(q); + q->when_last_offloaded = timestamp_get(); + } + // Check if any temp files need replication and start replicating BEGIN_ACCUM_TIME(q, time_internal); - result = consider_tempfile_replications(q); + result = vine_temp_start_replication(q); END_ACCUM_TIME(q, time_internal); if (result) { // recovered at least one temp file @@ -5497,7 +5633,7 @@ static struct vine_task *vine_wait_internal(struct vine_manager *q, int timeout, // in this wait. if (events > 0) { BEGIN_ACCUM_TIME(q, time_internal); - int done = !priority_queue_size(q->ready_tasks) && !list_size(q->waiting_retrieval_list) && !itable_size(q->running_table); + int done = !priority_queue_size(q->ready_tasks) && !list_size(q->pending_tasks) && !list_size(q->waiting_retrieval_list) && !itable_size(q->running_table); END_ACCUM_TIME(q, time_internal); if (done) { @@ -5973,13 +6109,24 @@ int vine_tune(struct vine_manager *q, const char *name, double value) } else if (!strcmp(name, "max-library-retries")) { q->max_library_retries = MIN(1, value); + } else if (!strcmp(name, "disk-proportion-available-to-task")) { if (value < 1 && value > 0) { q->disk_proportion_available_to_task = value; } + } else if (!strcmp(name, "enforce-worker-eviction-interval")) { q->enforce_worker_eviction_interval = (timestamp_t)(MAX(0, (int)value) * ONE_SECOND); + } else if (!strcmp(name, "balance-worker-disk-load")) { + q->balance_worker_disk_load = !!((int)value); + + } else if (!strcmp(name, "enable-debug-log")) { + if (value == 0) { + debug_flags_clear(); + debug_close(); + } + } else { debug(D_NOTICE | D_VINE, "Warning: tuning parameter \"%s\" not recognized\n", name); return -1; @@ -6438,16 +6585,18 @@ Should be invoked by the application when a file will never be needed again, to free up available space. */ -void vine_prune_file(struct vine_manager *m, struct vine_file *f) +int vine_prune_file(struct vine_manager *m, struct vine_file *f) { if (!f) { - return; + return 0; } if (!m) { - return; + return 0; } + int pruned_replica_count = 0; + /* delete all of the replicas present at remote workers. 
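A hedged sketch of driving the tuning options handled above from client code; the manager handle m and the values are placeholders.

/* Even out worker disk usage by offloading TEMP replicas from heavy workers. */
vine_tune(m, "balance-worker-disk-load", 1);

/* Evict roughly one worker per minute to exercise replication and recovery. */
vine_tune(m, "enforce-worker-eviction-interval", 60);

/* Turn the debug log off entirely for large production runs. */
vine_tune(m, "enable-debug-log", 0);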
*/ struct set *source_workers = hash_table_lookup(m->file_worker_table, f->cached_name); if (source_workers && set_size(source_workers) > 0) { @@ -6456,13 +6605,13 @@ void vine_prune_file(struct vine_manager *m, struct vine_file *f) for (int i = 0; workers_array[i] != NULL; i++) { struct vine_worker_info *w = (struct vine_worker_info *)workers_array[i]; delete_worker_file(m, w, f->cached_name, 0, 0); + pruned_replica_count++; } set_free_values_array(workers_array); } } - /* also remove from the replication table. */ - hash_table_remove(m->temp_files_to_replicate, f->cached_name); + return pruned_replica_count; } /* diff --git a/taskvine/src/manager/vine_manager.h b/taskvine/src/manager/vine_manager.h index bcf2405616..17adb70ed3 100644 --- a/taskvine/src/manager/vine_manager.h +++ b/taskvine/src/manager/vine_manager.h @@ -102,6 +102,7 @@ struct vine_manager { /* Primary data structures for tracking task state. */ struct itable *tasks; /* Maps task_id -> vine_task of all tasks in any state. */ + struct list *pending_tasks; /* List of vine_task that are waiting to be dispatched. */ struct priority_queue *ready_tasks; /* Priority queue of vine_task that are waiting to execute. */ struct itable *running_table; /* Table of vine_task that are running at workers. */ struct list *waiting_retrieval_list; /* List of vine_task that are waiting to be retrieved. */ @@ -123,7 +124,7 @@ struct vine_manager { struct hash_table *file_table; /* Maps fileid -> struct vine_file.* */ struct hash_table *file_worker_table; /* Maps cachename -> struct set of workers with a replica of the file.* */ - struct hash_table *temp_files_to_replicate; /* Maps cachename -> NULL. Used as a set of temp files to be replicated */ + struct priority_queue *temp_files_to_replicate; /* Maps cachename -> NULL. Used as a set of temp files to be replicated */ /* Primary scheduling controls. */ @@ -232,6 +233,13 @@ struct vine_manager { double sandbox_grow_factor; /* When task disk sandboxes are exhausted, increase the allocation using their measured valued times this factor */ double disk_proportion_available_to_task; /* intentionally reduces disk allocation for tasks to reserve some space for cache growth. */ + int return_recovery_tasks; /* If true, recovery tasks are returned by vine_wait to the user. By default they are handled internally. */ + int num_submitted_recovery_tasks; + int balance_worker_disk_load; /* If true, offload replicas from workers that are overloaded with temp files. */ + timestamp_t when_last_offloaded; + int64_t peak_used_cache; + int shutting_down; + /* todo: confirm datatype. int or int64 */ int max_task_stdout_storage; /* Maximum size of standard output from task. (If larger, send to a separate file.) */ int max_new_workers; /* Maximum number of workers to add in a single cycle before dealing with other matters. */ @@ -291,6 +299,15 @@ void vine_manager_remove_worker(struct vine_manager *q, struct vine_worker_info /* Check if the worker is able to transfer the necessary files for this task. */ int vine_manager_transfer_capacity_available(struct vine_manager *q, struct vine_worker_info *w, struct vine_task *t); +/* Delete a file from a worker. */ +int delete_worker_file(struct vine_manager *q, struct vine_worker_info *w, const char *filename, vine_cache_level_t cache_level, vine_cache_level_t delete_upto_level); + +/* Evict a random worker to simulate a worker failure. */ +int evict_random_worker(struct vine_manager *q); + +/* Get the available disk space in bytes for a worker. 
*/ +int64_t get_worker_available_disk_bytes(struct vine_worker_info *w); + /* The expected format of files created by the resource monitor.*/ #define RESOURCE_MONITOR_TASK_LOCAL_NAME "vine-task-%d" #define RESOURCE_MONITOR_REMOTE_NAME "cctools-monitor" diff --git a/taskvine/src/manager/vine_runtime_dir.c b/taskvine/src/manager/vine_runtime_dir.c index a2bc4194a0..5f5f586b32 100644 --- a/taskvine/src/manager/vine_runtime_dir.c +++ b/taskvine/src/manager/vine_runtime_dir.c @@ -134,6 +134,11 @@ char *vine_runtime_directory_create() return runtime_dir; } +char *vine_get_runtime_directory(struct vine_manager *m) +{ + return m->runtime_directory; +} + char *vine_get_path_log(struct vine_manager *m, const char *path) { return string_format("%s/vine-logs%s%s", m->runtime_directory, path ? "/" : "", path ? path : ""); diff --git a/taskvine/src/manager/vine_schedule.c b/taskvine/src/manager/vine_schedule.c index 0168c3a979..f4ce49f2c5 100644 --- a/taskvine/src/manager/vine_schedule.c +++ b/taskvine/src/manager/vine_schedule.c @@ -152,6 +152,51 @@ int check_worker_have_enough_disk_with_inputs(struct vine_manager *q, struct vin return ok; } +/* Count the number of committable cores for all workers. */ +int vine_schedule_count_committable_cores(struct vine_manager *q) +{ + int count = 0; + + char *key; + struct vine_worker_info *w; + HASH_TABLE_ITERATE(q->worker_table, key, w) + { + /* skip if the worker hasn't reported any resources yet */ + if (!w->resources) { + continue; + } + /* skip if the worker has no cores or gpus */ + if (w->resources->cores.total <= 0 && w->resources->gpus.total <= 0) { + continue; + } + /* count the number of free slots on running libraries */ + if (w->current_libraries && itable_size(w->current_libraries) > 0) { + uint64_t library_task_id = 0; + struct vine_task *library_task = NULL; + ITABLE_ITERATE(w->current_libraries, library_task_id, library_task) + { + if (!library_task || !library_task->provides_library) { + continue; + } + if (library_task->function_slots_total > library_task->function_slots_inuse) { + count += library_task->function_slots_total - library_task->function_slots_inuse; + } + } + } + /* count the number of free cores */ + if (w->resources->cores.total > 0 && overcommitted_resource_total(q, w->resources->cores.total) > w->resources->cores.inuse) { + count += overcommitted_resource_total(q, w->resources->cores.total) - w->resources->cores.inuse; + } + /* count the number of free gpus */ + if (w->resources->gpus.total > 0 && overcommitted_resource_total(q, w->resources->gpus.total) > w->resources->gpus.inuse) { + // Don't count gpus for now, because the manager has not yet fully supported scheduling tasks to GPUs. + // count += overcommitted_resource_total(q, w->resources->gpus.total) - w->resources->gpus.inuse; + } + } + + return count; +} + /* Check if this worker has committable resources for any type of task. * If it returns false, neither a function task, library task nor a regular task can run on this worker. * If it returns true, the worker has either free slots for function calls or sufficient resources for regular tasks. @@ -534,7 +579,6 @@ This is quite an expensive function and so is invoked only periodically. 
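To make the accounting in vine_schedule_count_committable_cores() above concrete with made-up numbers: a worker reporting 16 cores with 10 in use, plus one running library with 16 function slots of which 4 are in use, contributes (16 - 10) + (16 - 4) = 18 committable slots; GPU capacity is deliberately left out of the count for now.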
void vine_schedule_check_for_large_tasks(struct vine_manager *q) { - int t_idx; struct vine_task *t; int unfit_core = 0; int unfit_mem = 0; @@ -543,10 +587,7 @@ void vine_schedule_check_for_large_tasks(struct vine_manager *q) struct rmsummary *largest_unfit_task = rmsummary_create(-1); - int iter_count = 0; - int iter_depth = priority_queue_size(q->ready_tasks); - - PRIORITY_QUEUE_BASE_ITERATE(q->ready_tasks, t_idx, t, iter_count, iter_depth) + LIST_ITERATE(q->pending_tasks, t) { // check each task against the queue of connected workers vine_resource_bitmask_t bit_set = is_task_larger_than_any_worker(q, t); @@ -605,6 +646,5 @@ int vine_schedule_check_fixed_location(struct vine_manager *q, struct vine_task return 1; } } - debug(D_VINE, "Missing fixed_location dependencies for task: %d", t->task_id); return 0; -} +} \ No newline at end of file diff --git a/taskvine/src/manager/vine_schedule.h b/taskvine/src/manager/vine_schedule.h index 4ef0c613a9..6455c18798 100644 --- a/taskvine/src/manager/vine_schedule.h +++ b/taskvine/src/manager/vine_schedule.h @@ -24,4 +24,6 @@ int vine_schedule_check_fixed_location(struct vine_manager *q, struct vine_task int vine_schedule_in_ramp_down(struct vine_manager *q); struct vine_task *vine_schedule_find_library(struct vine_manager *q, struct vine_worker_info *w, const char *library_name); int check_worker_against_task(struct vine_manager *q, struct vine_worker_info *w, struct vine_task *t); +int vine_schedule_count_committable_cores(struct vine_manager *q); + #endif diff --git a/taskvine/src/manager/vine_task.c b/taskvine/src/manager/vine_task.c index 8b13b7488e..fa61fe8813 100644 --- a/taskvine/src/manager/vine_task.c +++ b/taskvine/src/manager/vine_task.c @@ -88,8 +88,15 @@ struct vine_task *vine_task_create(const char *command_line) void vine_task_clean(struct vine_task *t) { + t->time_when_scheduling_start = 0; + t->time_when_scheduling_end = 0; + t->time_when_commit_start = 0; t->time_when_commit_end = 0; + + t->time_when_get_result_start = 0; + t->time_when_get_result_end = 0; + t->time_when_retrieval = 0; t->time_when_done = 0; @@ -153,6 +160,15 @@ void vine_task_reset(struct vine_task *t) t->time_workers_execute_exhaustion = 0; t->time_workers_execute_failure = 0; + t->time_when_scheduling_start = 0; + t->time_when_scheduling_end = 0; + + t->time_when_commit_start = 0; + t->time_when_commit_end = 0; + + t->time_when_get_result_start = 0; + t->time_when_get_result_end = 0; + rmsummary_delete(t->resources_measured); rmsummary_delete(t->resources_allocated); t->resources_measured = rmsummary_create(-1); diff --git a/taskvine/src/manager/vine_task.h b/taskvine/src/manager/vine_task.h index d97e0b9530..4867e4f5a6 100644 --- a/taskvine/src/manager/vine_task.h +++ b/taskvine/src/manager/vine_task.h @@ -106,9 +106,15 @@ struct vine_task { timestamp_t time_when_submitted; /**< The time at which this task was added to the queue. */ timestamp_t time_when_done; /**< The time at which the task is mark as retrieved, after transfering output files and other final processing. */ + timestamp_t time_when_scheduling_start; /**< The time when the task starts to be considered for scheduling. */ + timestamp_t time_when_scheduling_end; /**< The time when the task is mapped to a worker and ready to be committed. */ + timestamp_t time_when_commit_start; /**< The time when the task starts to be transfered to a worker. */ timestamp_t time_when_commit_end; /**< The time when the task is completely transfered to a worker. 
*/ + timestamp_t time_when_get_result_start; /**< The time when the task starts to get the result from the worker. */ + timestamp_t time_when_get_result_end; /**< The time when the task gets the result from the worker. */ + timestamp_t time_when_retrieval; /**< The time when output files start to be transfered back to the manager. time_done - time_when_retrieval is the time taken to transfer output files. */ timestamp_t time_when_last_failure; /**< If larger than 0, the time at which the last task failure was detected. */ diff --git a/taskvine/src/manager/vine_temp.c b/taskvine/src/manager/vine_temp.c new file mode 100644 index 0000000000..2f171f5efe --- /dev/null +++ b/taskvine/src/manager/vine_temp.c @@ -0,0 +1,215 @@ +#include "vine_temp.h" +#include "priority_queue.h" +#include "vine_file.h" +#include "vine_worker_info.h" +#include "vine_file_replica_table.h" +#include "macros.h" +#include "stringtools.h" +#include "vine_manager.h" +#include "debug.h" +#include "random.h" +#include "vine_manager_put.h" +#include "xxmalloc.h" + +/*************************************************************/ +/* Private Functions */ +/*************************************************************/ + +static struct vine_worker_info *get_best_source_worker(struct vine_manager *q, struct vine_file *f) +{ + if (!q || !f || f->type != VINE_TEMP) { + return NULL; + } + + struct set *sources = hash_table_lookup(q->file_worker_table, f->cached_name); + if (!sources) { + return NULL; + } + + struct priority_queue *valid_sources_queue = priority_queue_create(0); + struct vine_worker_info *w = NULL; + SET_ITERATE(sources, w) + { + /* skip if transfer port is not active or in draining mode */ + if (!w->transfer_port_active || w->draining) { + continue; + } + /* skip if incoming transfer counter is too high */ + if (w->outgoing_xfer_counter >= q->worker_source_max_transfers) { + continue; + } + /* skip if the worker does not have this file */ + struct vine_file_replica *replica = vine_file_replica_table_lookup(w, f->cached_name); + if (!replica) { + continue; + } + /* skip if the file is not ready */ + if (replica->state != VINE_FILE_REPLICA_STATE_READY) { + continue; + } + /* those with less outgoing_xfer_counter are preferred */ + priority_queue_push(valid_sources_queue, w, -w->outgoing_xfer_counter); + } + + struct vine_worker_info *best_source = priority_queue_pop(valid_sources_queue); + priority_queue_delete(valid_sources_queue); + + return best_source; +} + +static struct vine_worker_info *get_best_dest_worker(struct vine_manager *q, struct vine_file *f) +{ + if (!q || !f || f->type != VINE_TEMP) { + return NULL; + } + + struct priority_queue *valid_destinations = priority_queue_create(0); + + char *key; + struct vine_worker_info *w; + HASH_TABLE_ITERATE(q->worker_table, key, w) + { + /* skip if transfer port is not active or in draining mode */ + if (!w->transfer_port_active || w->draining) { + continue; + } + /* skip if the incoming transfer counter is too high */ + if (w->incoming_xfer_counter >= q->worker_source_max_transfers) { + continue; + } + /* skip if the worker already has this file */ + struct vine_file_replica *replica = vine_file_replica_table_lookup(w, f->cached_name); + if (replica) { + continue; + } + /* skip if the worker does not have enough disk space */ + int64_t available_disk_space = get_worker_available_disk_bytes(w); + if ((int64_t)f->size > available_disk_space) { + continue; + } + /* workers with more available disk space are preferred to hold the file */ + 
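The new vine_task.h timestamps bracket each phase of a task's life: scheduling, commit to a worker, and result retrieval from the worker, with time_when_done minus time_when_retrieval giving the output transfer time per the existing header comment. A small sketch, assuming only the fields declared above, of how these could be turned into per-phase durations; phase_us() and print_task_phases() are illustrative helpers, and fields still at zero are treated as "phase did not occur".

#include <inttypes.h>
#include <stdio.h>

#include "timestamp.h"
#include "vine_task.h"

/* Duration of one phase, or 0 if either endpoint was never recorded. */
static timestamp_t phase_us(timestamp_t start, timestamp_t end)
{
	return (start && end && end > start) ? (end - start) : 0;
}

/* Hypothetical reporting helper built only on the fields declared above. */
static void print_task_phases(const struct vine_task *t)
{
	timestamp_t scheduling = phase_us(t->time_when_scheduling_start, t->time_when_scheduling_end);
	timestamp_t commit = phase_us(t->time_when_commit_start, t->time_when_commit_end);
	timestamp_t result = phase_us(t->time_when_get_result_start, t->time_when_get_result_end);
	/* Per the header comment, done minus retrieval is the output transfer time. */
	timestamp_t outputs = phase_us(t->time_when_retrieval, t->time_when_done);

	printf("task %d: scheduling %" PRIu64 " us, commit %" PRIu64 " us, result %" PRIu64 " us, outputs %" PRIu64 " us\n",
			t->task_id, scheduling, commit, result, outputs);
}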
priority_queue_push(valid_destinations, w, available_disk_space); + } + + struct vine_worker_info *best_destination = priority_queue_pop(valid_destinations); + priority_queue_delete(valid_destinations); + + return best_destination; +} + +void vine_temp_start_peer_transfer(struct vine_manager *q, struct vine_file *f, struct vine_worker_info *source_worker, struct vine_worker_info *dest_worker) +{ + if (!q || !f || f->type != VINE_TEMP || !source_worker || !dest_worker) { + return; + } + + char *source_addr = string_format("%s/%s", source_worker->transfer_url, f->cached_name); + vine_manager_put_url_now(q, dest_worker, source_worker, source_addr, f); + free(source_addr); +} + +int vine_temp_replicate_file_now(struct vine_manager *q, struct vine_file *f) +{ + if (!q || !f || f->type != VINE_TEMP) { + return 0; + } + + struct vine_worker_info *source_worker = get_best_source_worker(q, f); + if (!source_worker) { + return 0; + } + + struct vine_worker_info *dest_worker = get_best_dest_worker(q, f); + if (!dest_worker) { + return 0; + } + + vine_temp_start_peer_transfer(q, f, source_worker, dest_worker); + + return 1; +} + +int vine_temp_start_replication(struct vine_manager *q) +{ + if (!q) { + return 0; + } + + int processed = 0; + int iter_count = 0; + int iter_depth = MIN(q->attempt_schedule_depth, priority_queue_size(q->temp_files_to_replicate)); + struct list *skipped = list_create(); + + struct vine_file *f; + while ((f = priority_queue_pop(q->temp_files_to_replicate)) && (iter_count++ < iter_depth)) { + if (!f || f->type != VINE_TEMP || f->state != VINE_FILE_STATE_CREATED) { + continue; + } + + /* skip if the file has enough replicas or no replicas */ + int current_replica_count = vine_file_replica_count(q, f); + if (current_replica_count >= q->temp_replica_count || current_replica_count == 0) { + continue; + } + /* skip if the file has no ready replicas */ + int current_ready_replica_count = vine_file_replica_table_count_replicas(q, f->cached_name, VINE_FILE_REPLICA_STATE_READY); + if (current_ready_replica_count == 0) { + continue; + } + + /* if reach here, it means the file needs to be replicated and there is at least one ready replica. */ + if (!vine_temp_replicate_file_now(q, f)) { + list_push_tail(skipped, f); + continue; + } + + processed++; + + /* push back and keep evaluating the same file with a lower priority, until no more source + * or destination workers are available, or the file has enough replicas. 
*/ + vine_temp_replicate_file_later(q, f); + } + + while ((f = list_pop_head(skipped))) { + vine_temp_replicate_file_later(q, f); + } + list_delete(skipped); + + return processed; +} + +/*************************************************************/ +/* Public Functions */ +/*************************************************************/ + +int vine_temp_replicate_file_later(struct vine_manager *q, struct vine_file *f) +{ + if (!q || !f || f->type != VINE_TEMP || f->state != VINE_FILE_STATE_CREATED) { + return 0; + } + + int current_replica_count = vine_file_replica_count(q, f); + if (current_replica_count == 0 || current_replica_count >= q->temp_replica_count) { + return 0; + } + + priority_queue_push(q->temp_files_to_replicate, f, -current_replica_count); + + return 1; +} + +int vine_temp_handle_file_lost(struct vine_manager *q, char *cachename) +{ + if (!q || !cachename) { + return 0; + } + + struct vine_file *f = hash_table_lookup(q->file_table, cachename); + if (!f || f->type != VINE_TEMP || f->state != VINE_FILE_STATE_CREATED) { + return 0; + } + + vine_temp_replicate_file_later(q, f); + + return 1; +} diff --git a/taskvine/src/manager/vine_temp.h b/taskvine/src/manager/vine_temp.h new file mode 100644 index 0000000000..ff2efb5011 --- /dev/null +++ b/taskvine/src/manager/vine_temp.h @@ -0,0 +1,12 @@ +#ifndef vine_temp_H +#define vine_temp_H + +#include "vine_manager.h" + +int vine_temp_replicate_file_now(struct vine_manager *q, struct vine_file *f); +int vine_temp_replicate_file_later(struct vine_manager *q, struct vine_file *f); +int vine_temp_handle_file_lost(struct vine_manager *q, char *cachename); +int vine_temp_start_replication(struct vine_manager *q); +void vine_temp_start_peer_transfer(struct vine_manager *q, struct vine_file *f, struct vine_worker_info *source_worker, struct vine_worker_info *dest_worker); + +#endif \ No newline at end of file diff --git a/taskvine/src/worker/vine_worker.c b/taskvine/src/worker/vine_worker.c index d285d5bb4c..b830fb2249 100644 --- a/taskvine/src/worker/vine_worker.c +++ b/taskvine/src/worker/vine_worker.c @@ -270,6 +270,10 @@ void deliver_async_messages(struct link *l) void send_async_message(struct link *l, const char *fmt, ...) { + if (!l) { + return; + } + va_list va; char *message = malloc(VINE_LINE_MAX); va_start(va, fmt); @@ -498,6 +502,10 @@ its size in bytes and transfer time in usec. void vine_worker_send_cache_update(struct link *manager, const char *cachename, struct vine_cache_file *f) { + if (!manager) { + return; + } + char *transfer_id = hash_table_remove(current_transfers, cachename); if (!transfer_id) { transfer_id = xxstrdup("X"); @@ -524,6 +532,10 @@ could not be loaded. Accompanied by a corresponding error message. void vine_worker_send_cache_invalid(struct link *manager, const char *cachename, const char *message) { + if (!manager) { + return; + } + int length = strlen(message); char *transfer_id = hash_table_remove(current_transfers, cachename); if (transfer_id) { @@ -1694,7 +1706,7 @@ static void vine_worker_serve_manager(struct link *manager) hence a maximum wait time of five seconds is enforced. */ - int wait_msec = 5000; + int wait_msec = 0; if (sigchld_received_flag) { wait_msec = 0;
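vine_temp.h exposes a small API around the new priority-queue-based replication path: files are queued with vine_temp_replicate_file_later(), whose push priority is the negated replica count so under-replicated files pop first; lost replicas are re-queued through vine_temp_handle_file_lost(); and vine_temp_start_replication() drains a bounded number of entries per call. A sketch of how a manager loop might wire these together, assuming only the declarations above; the hook names on_temp_file_created, on_temp_file_lost, and replication_step are hypothetical.

#include "debug.h"
#include "vine_file.h"
#include "vine_manager.h"
#include "vine_temp.h"

/* Hypothetical hook: a VINE_TEMP file just reached the CREATED state on some
   worker, so queue it; under-replicated files pop first because the push
   priority is the negated replica count. */
static void on_temp_file_created(struct vine_manager *q, struct vine_file *f)
{
	vine_temp_replicate_file_later(q, f);
}

/* Hypothetical hook: a worker reported a lost or invalid cached replica, so
   re-queue the file and let a later pass restore the desired replica count. */
static void on_temp_file_lost(struct vine_manager *q, char *cachename)
{
	vine_temp_handle_file_lost(q, cachename);
}

/* Hypothetical per-iteration step: start as many peer-to-peer transfers as
   the per-worker transfer limits currently allow. */
static void replication_step(struct vine_manager *q)
{
	int started = vine_temp_start_replication(q);
	if (started > 0) {
		debug(D_VINE, "started %d temp file replications", started);
	}
}

Because vine_temp_start_replication() bounds each pass by attempt_schedule_depth and the current queue size, invoking it once per manager iteration keeps the per-iteration cost small.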