diff --git a/.proj.toml b/.proj.toml index 463aa0bb07..38690f710b 100644 --- a/.proj.toml +++ b/.proj.toml @@ -78,12 +78,12 @@ has-cpu-only-benchmarks = false has-cuda-tests = false has-cuda-benchmarks = false -# [targets.local-execution] -# type = "lib" -# has-cpu-only-tests = true -# has-cpu-only-benchmarks = false -# has-cuda-tests = true -# has-cuda-benchmarks = false +[targets.local-execution] +type = "lib" +has-cpu-only-tests = true +has-cpu-only-benchmarks = false +has-cuda-tests = true +has-cuda-benchmarks = false # [targets.local-pcg-execution] # type = "lib" diff --git a/lib/local-execution/include/local-execution/atomic_training_tensor_guid_t.dtg.toml b/lib/local-execution/include/local-execution/atomic_training_tensor_guid_t.dtg.toml deleted file mode 100644 index 12380d80ba..0000000000 --- a/lib/local-execution/include/local-execution/atomic_training_tensor_guid_t.dtg.toml +++ /dev/null @@ -1,17 +0,0 @@ -namespace = "FlexFlow" -name = "atomic_training_tensor_guid_t" -type = "struct" -features = [ - "eq", - "ord", - "hash", - "fmt", -] - -includes = [ - "utils/nonnegative_int/nonnegative_int.h" -] - -[[fields]] -name = "raw_index" -type = "::FlexFlow::nonnegative_int" diff --git a/lib/local-execution/include/local-execution/computation_graph_instance/README.md b/lib/local-execution/include/local-execution/computation_graph_instance/README.md index 6b7f4b43db..89493dcdbb 100644 --- a/lib/local-execution/include/local-execution/computation_graph_instance/README.md +++ b/lib/local-execution/include/local-execution/computation_graph_instance/README.md @@ -1 +1,14 @@ -The primary external-facing interface of local-execution +The primary external-facing interface of local-execution. + +Flow: + +* input (from compiler): `ComputationGraph` +* `create_computation_graph_instance()` => `ComputationGraphInstance` +* `initialize_computation_graph_instance()` => `InitializedComputationGraphInstance` +* execute (TBD) + +Details: + +* `ComputationGraph` is the unexpanded form of the graph: no passes, no parallelism, etc. +* `create_computation_graph_instance()` takes the `ComputationGraph` and expands it into a `DynamicOpenDataflowGraph`. This form has passes and updates but no allocations and no parallelism. (Note because this is the *local* executor there will be no parallelism.) This version gets stored in the `ComputationGraphInstance`. +* `initialize_computation_graph_instance()` takes the `ComputationGraphInstance`, along with user-provided input tensors. It allocates any remaining (not-user-provided) tensors and performs initialization (cuBLAS handles, etc.). These get stored in a new `DynamicOpenDataflowGraph` which gets wrapped in `InitializedComputationGraphInstance`. (The old `DynamicOpenDataflowGraph` is treated as immutable and is not modified.) This form is fully specified and ready for (single-device) execution. diff --git a/lib/local-execution/include/local-execution/computation_graph_instance/computation_graph_instance.h b/lib/local-execution/include/local-execution/computation_graph_instance/computation_graph_instance.h index f28552603f..22016db50b 100644 --- a/lib/local-execution/include/local-execution/computation_graph_instance/computation_graph_instance.h +++ b/lib/local-execution/include/local-execution/computation_graph_instance/computation_graph_instance.h @@ -1,45 +1,49 @@ #ifndef _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_COMPUTATION_GRAPH_INSTANCE_H #define _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_COMPUTATION_GRAPH_INSTANCE_H -#include "kernels/accessor.h" -#include "local-execution/computation_graph_training_tensor_ref_t.dtg.h" -#include "local-execution/local_task_registry.dtg.h" -#include "local-execution/local_tensor_backing.dtg.h" +#include "kernels/device_handle_t.dtg.h" +#include "kernels/profiling_settings.dtg.h" #include "pcg/computation_graph.dtg.h" #include "pcg/layer_guid_t.dtg.h" -#include "task-spec/symbolic/training_symbolic_computation_graph_from_cg_conversion.dtg.h" +#include "pcg/optimizer_attrs.dtg.h" +#include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.dtg.h" +#include "task-spec/ff_iteration_config.dtg.h" #include "utils/units/milliseconds_t.h" -#include namespace FlexFlow { struct ComputationGraphInstance { public: - ComputationGraphInstance() = delete; - - explicit ComputationGraphInstance( - TrainingSymbolicComputationGraphFromCgConversion const &, - LocalTensorBacking const &, - LocalTaskRegistry const &); - -public: - TrainingSymbolicComputationGraphFromCgConversion const & - get_symbolic_training_graph_for_cg() const; - LocalTensorBacking const &get_tensor_backing() const; - LocalTaskRegistry const &get_task_registry() const; + ComputationGraphInstance(DynamicOpenDataflowGraph, Allocator &); + DynamicOpenDataflowGraph const &get_dynamic_dataflow_graph() const; + Allocator &get_allocator() const; private: - TrainingSymbolicComputationGraphFromCgConversion - symbolic_training_graph_for_cg; - LocalTensorBacking tensor_backing; - LocalTaskRegistry task_registry; + DynamicOpenDataflowGraph initialized_dataflow_graph; + Allocator &allocator; }; -ComputationGraphInstance create_computation_graph_instance( - ComputationGraph const &, - bidict> const - &); +ComputationGraphInstance initialize_computation_graph_instance( + ComputationGraph const &cg, + OptimizerAttrs const &optimizer, + std::unordered_map const &, + Allocator &, + ProfilingSettings const &, + device_handle_t const &, + DeviceType, + FFIterationConfig const &, + size_t); + +std::unordered_map> + perform_forward_pass_for_computation_graph_instance( + ComputationGraphInstance const &); + +std::unordered_map> + perform_backward_pass_for_computation_graph_instance( + ComputationGraphInstance const &); + +void perform_update_pass_for_computation_graph_instance( + ComputationGraphInstance const &); } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/computation_graph_instance/initialized_computation_graph_instance.dtg.toml b/lib/local-execution/include/local-execution/computation_graph_instance/initialized_computation_graph_instance.dtg.toml deleted file mode 100644 index 8589d5edec..0000000000 --- a/lib/local-execution/include/local-execution/computation_graph_instance/initialized_computation_graph_instance.dtg.toml +++ /dev/null @@ -1,35 +0,0 @@ -namespace = "FlexFlow" -name = "InitializedComputationGraphInstance" -type = "struct" -features = [ - "eq", - "ord", - "hash", - "json", - "fmt", - "rapidcheck", -] - -includes = [ - # "local-execution/computation_graph_instance.dtg.h", - # "local-execution/local_device_states_backing.dtg.h", -] - -src_includes = [] - -fields = [] -# [[fields]] -# name = "per_device_op_states" -# type = "::FlexFlow::LocalDeviceStatesBacking" -# -# [[fields]] -# name = "allocator" -# type = "::FlexFlow::Allocator" -# -# [[fields]] -# name = "atomic_tensor_backing" -# type = "::FlexFlow::LocalAtomicTensorBacking" -# -# [[fields]] -# name = "computation_graph_instance" -# type = "::FlexFlow::ComputationGraphInstance" diff --git a/lib/local-execution/include/local-execution/computation_graph_instance/initialized_computation_graph_instance.h b/lib/local-execution/include/local-execution/computation_graph_instance/initialized_computation_graph_instance.h deleted file mode 100644 index a014ff596d..0000000000 --- a/lib/local-execution/include/local-execution/computation_graph_instance/initialized_computation_graph_instance.h +++ /dev/null @@ -1,49 +0,0 @@ -#ifndef _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_INITIALIZED_COMPUTATION_GRAPH_INSTANCE_H -#define _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_INITIALIZED_COMPUTATION_GRAPH_INSTANCE_H - -#include "local-execution/computation_graph_instance/computation_graph_instance.h" -#include "local-execution/local_atomic_tensor_backing.dtg.h" -#include "local-execution/local_device_states_backing.dtg.h" -#include "local-execution/local_task_registry.dtg.h" -#include "local-execution/local_tensor_backing.dtg.h" -#include "task-spec/runtime_task_invocation/runtime_arg_config.dtg.h" -#include "task-spec/symbolic/training_symbolic_computation_graph_from_cg_conversion.dtg.h" -#include "utils/units/milliseconds_t.h" - -namespace FlexFlow { - -struct InitializedComputationGraphInstance { -public: - LocalTensorBacking const &get_tensor_backing() const; - LocalTaskRegistry const &get_task_registry() const; - TrainingSymbolicComputationGraphFromCgConversion const & - get_symbolic_training_graph_for_cg() const; - LocalAtomicTensorBacking const &get_atomic_tensor_backing() const; - Allocator &get_allocator() const; - RuntimeArgConfig const &get_runtime_arg_config() const; - -private: - LocalDeviceStatesBacking per_device_op_states; - Allocator &allocator; - LocalAtomicTensorBacking atomic_tensor_backing; - ComputationGraphInstance computation_graph_instance; -}; - -InitializedComputationGraphInstance - initialize_computation_graph_instance(ComputationGraphInstance const &, - Allocator &); - -std::unordered_map> - perform_forward_pass_for_computation_graph_instance( - InitializedComputationGraphInstance const &); - -std::unordered_map> - perform_backward_pass_for_computation_graph_instance( - InitializedComputationGraphInstance const &); - -void perform_update_pass_for_computation_graph_instance( - InitializedComputationGraphInstance const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/computation_graph_training_tensor_ref_t.dtg.toml b/lib/local-execution/include/local-execution/computation_graph_training_tensor_ref_t.dtg.toml deleted file mode 100644 index d25dc407e2..0000000000 --- a/lib/local-execution/include/local-execution/computation_graph_training_tensor_ref_t.dtg.toml +++ /dev/null @@ -1,24 +0,0 @@ -namespace = "FlexFlow" -name = "computation_graph_training_tensor_ref_t" -type = "struct" -features = [ - "eq", - "ord", - "hash", - "json", - "fmt", - "rapidcheck", -] - -includes = [ - "pcg/tensor_guid_t.dtg.h" , - "task-spec/op_training_tensor_type.dtg.h", -] - -[[fields]] -name = "tensor_guid" -type = "::FlexFlow::tensor_guid_t" - -[[fields]] -name = "tensor_type" -type = "::FlexFlow::OpTrainingTensorType" diff --git a/lib/local-execution/include/local-execution/cost_estimator/local_cost_estimator.h b/lib/local-execution/include/local-execution/cost_estimator/local_cost_estimator.h index ba5b511227..d07a8b731b 100644 --- a/lib/local-execution/include/local-execution/cost_estimator/local_cost_estimator.h +++ b/lib/local-execution/include/local-execution/cost_estimator/local_cost_estimator.h @@ -1,3 +1,5 @@ +#if 0 // FIXME (Elliott): fix cost estimator + #ifndef _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_COST_ESTIMATOR_LOCAL_COST_ESTIMATOR_H #define _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_COST_ESTIMATOR_LOCAL_COST_ESTIMATOR_H @@ -33,3 +35,5 @@ CostEstimator get_local_cost_estimator(RuntimeArgConfig const &); } // namespace FlexFlow #endif + +#endif diff --git a/lib/local-execution/include/local-execution/cost_estimator/tracked_allocator.h b/lib/local-execution/include/local-execution/cost_estimator/tracked_allocator.h index 0b531f9b3d..79a62b628a 100644 --- a/lib/local-execution/include/local-execution/cost_estimator/tracked_allocator.h +++ b/lib/local-execution/include/local-execution/cost_estimator/tracked_allocator.h @@ -1,3 +1,5 @@ +#if 0 // FIXME (Elliott): fix cost estimator + #ifndef _FLEXFLOW_LOCAL_EXECUTION_TRACKED_ALLOCATOR_H #define _FLEXFLOW_LOCAL_EXECUTION_TRACKED_ALLOCATOR_H @@ -33,3 +35,5 @@ size_t get_tracked_memory_usage(Allocator &wrapped_allocator); } // namespace FlexFlow #endif + +#endif diff --git a/lib/local-execution/include/local-execution/execute_task_for_layer.h b/lib/local-execution/include/local-execution/execute_task_for_layer.h index 587ff96687..e306c013cd 100644 --- a/lib/local-execution/include/local-execution/execute_task_for_layer.h +++ b/lib/local-execution/include/local-execution/execute_task_for_layer.h @@ -1,16 +1,11 @@ +#if 0 // FIXME (Elliott): fix execute task + #ifndef _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_EXECUTE_TASK_FOR_LAYER_H #define _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_EXECUTE_TASK_FOR_LAYER_H -#include "local-execution/local_atomic_tensor_backing.dtg.h" -#include "local-execution/local_ready_to_launch_task.dtg.h" +#include "local-execution/local_concrete_task_invocation.dtg.h" #include "local-execution/local_task_registry.dtg.h" -#include "local-execution/local_tensor_backing.dtg.h" #include "pcg/layer_guid_t.dtg.h" -#include "task-spec/runtime_task_invocation/runtime_arg_config.dtg.h" -#include "task-spec/runtime_task_invocation/runtime_task_invocation.dtg.h" -#include "task-spec/symbolic/symbolic_cg_op_attrs_and_training_signature_with_shapes.dtg.h" -#include "task-spec/symbolic/training_symbolic_computation_graph.dtg.h" -#include "task-spec/symbolic/training_symbolic_computation_graph_from_cg_conversion.dtg.h" #include "utils/units/milliseconds_t.h" namespace FlexFlow { @@ -31,24 +26,6 @@ std::optional execute_init_for_layer( LocalTaskRegistry const &, RuntimeArgConfig const &); -std::optional execute_forward_for_layer( - symbolic_layer_guid_t, - SymbolicCgOpAttrsAndTrainingSignatureWithShapes const &, - LocalTensorBacking const &, - LocalAtomicTensorBacking const &, - Allocator &, - LocalTaskRegistry const &, - RuntimeArgConfig const &); - -std::optional execute_backward_for_layer( - symbolic_layer_guid_t, - SymbolicCgOpAttrsAndTrainingSignatureWithShapes const &, - LocalTensorBacking const &, - LocalAtomicTensorBacking const &, - Allocator &, - LocalTaskRegistry const &, - RuntimeArgConfig const &); - void execute_compute_loss(TrainingSymbolicComputationGraph const &, LocalTensorBacking const &, LocalAtomicTensorBacking const &, @@ -85,3 +62,5 @@ std::unordered_map> } // namespace FlexFlow #endif + +#endif diff --git a/lib/local-execution/include/local-execution/local_atomic_tensor_backing.dtg.toml b/lib/local-execution/include/local-execution/local_atomic_tensor_backing.dtg.toml deleted file mode 100644 index 5fe6b05b52..0000000000 --- a/lib/local-execution/include/local-execution/local_atomic_tensor_backing.dtg.toml +++ /dev/null @@ -1,21 +0,0 @@ -namespace = "FlexFlow" -name = "LocalAtomicTensorBacking" -type = "struct" -features = [ - "eq", - "fmt", -] - -includes = [ - "kernels/accessor.h", - "local-execution/atomic_training_tensor_guid_t.dtg.h", -] - -src_includes = [ - "utils/fmt/unordered_map.h", -] - - -[[fields]] -name = "accessor_from_atomic_tensor_map" -type = "std::unordered_map<::FlexFlow::atomic_training_tensor_guid_t, ::FlexFlow::GenericTensorAccessorW>" diff --git a/lib/local-execution/include/local-execution/local_atomic_tensor_backing.h b/lib/local-execution/include/local-execution/local_atomic_tensor_backing.h deleted file mode 100644 index 11f9f3e56a..0000000000 --- a/lib/local-execution/include/local-execution/local_atomic_tensor_backing.h +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOCAL_ATOMIC_TENSOR_BACKING_H -#define _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOCAL_ATOMIC_TENSOR_BACKING_H - -#include "kernels/allocation.h" -#include "local-execution/atomic_task_invocation.dtg.h" -#include "local-execution/local_atomic_tensor_backing.dtg.h" -#include "local-execution/tensor_slot_backing.dtg.h" -#include "task-spec/runtime_task_invocation/runtime_arg_config.dtg.h" -#include "task-spec/task_argument_accessor/task_argument_accessor.h" - -namespace FlexFlow { - -std::unordered_map - construct_tensor_slots_backing_for_binding(LocalAtomicTensorBacking const &, - AtomicTaskBinding const &); - -TaskArgumentAccessor get_task_arg_accessor_for_atomic_task_binding( - LocalAtomicTensorBacking const &, AtomicTaskBinding const &, Allocator &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/local_concrete_task_graph.dtg.toml b/lib/local-execution/include/local-execution/local_concrete_task_graph.dtg.toml deleted file mode 100644 index 8dde33a49a..0000000000 --- a/lib/local-execution/include/local-execution/local_concrete_task_graph.dtg.toml +++ /dev/null @@ -1,24 +0,0 @@ -namespace = "FlexFlow" -name = "LocalConcreteTaskGraph" -type = "struct" -features = [ - "eq", - "ord", - "hash", - "json", - "fmt", - "rapidcheck", -] - -includes = [ - "local-execution/local_concrete_task_invocation.dtg.h", -] - -src_includes = [ - "utils/hash/unordered_set.h", - "utils/fmt/unordered_set.h", -] - -[[fields]] -name = "task_invocations" -type = "std::unordered_set<::FlexFlow::LocalConcreteTaskInvocation>" diff --git a/lib/local-execution/include/local-execution/local_concrete_task_graph.h b/lib/local-execution/include/local-execution/local_concrete_task_graph.h deleted file mode 100644 index c2f8c405b0..0000000000 --- a/lib/local-execution/include/local-execution/local_concrete_task_graph.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOCAL_CONCRETE_TASK_GRAPH_H -#define _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOCAL_CONCRETE_TASK_GRAPH_H - -#include "local-execution/local_concrete_task_graph.dtg.h" - -namespace FlexFlow { - -std::vector - local_concrete_task_graph_topological_ordering( - LocalConcreteTaskGraph const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/local_device_states_backing.dtg.toml b/lib/local-execution/include/local-execution/local_device_states_backing.dtg.toml deleted file mode 100644 index 350bf7756f..0000000000 --- a/lib/local-execution/include/local-execution/local_device_states_backing.dtg.toml +++ /dev/null @@ -1,14 +0,0 @@ -namespace = "FlexFlow" -name = "LocalDeviceStatesBacking" -type = "struct" -features = [] - -includes = [ - "task-spec/device_specific_per_device_op_state.dtg.h", - "pcg/layer_guid_t.dtg.h", - "", -] - -[[fields]] -name = "per_device_op_states" -type = "std::unordered_map<::FlexFlow::layer_guid_t, std::optional<::FlexFlow::DeviceSpecificPerDeviceOpState>>" diff --git a/lib/local-execution/include/local-execution/local_device_states_backing.h b/lib/local-execution/include/local-execution/local_device_states_backing.h deleted file mode 100644 index 5650197e44..0000000000 --- a/lib/local-execution/include/local-execution/local_device_states_backing.h +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOCAL_DEVICE_STATES_BACKING_H -#define _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOCAL_DEVICE_STATES_BACKING_H - -#include "local-execution/local_device_states_backing.dtg.h" -#include "local-execution/local_task_argument_accessor.h" -#include "local-execution/local_task_registry.dtg.h" -#include "local-execution/local_tensor_backing.dtg.h" -#include "pcg/computation_graph.h" -#include "task-spec/per_device_op_state.h" -#include "task-spec/symbolic/symbolic_layer_training_tensor_group_signature_with_shapes.dtg.h" - -namespace FlexFlow { - -LocalDeviceStatesBacking make_local_device_states_backing_for_computation_graph( - LocalTaskRegistry const &, - std::unordered_map< - layer_guid_t, - SymbolicLayerTrainingTensorGroupSignatureWithShapes> const &, - RuntimeArgConfig const &runtime_arg_config, - LocalTensorBacking const &, - Allocator &); - -std::optional - get_per_device_op_state_if_exists(LocalDeviceStatesBacking const &, - layer_guid_t const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/local_task_argument_accessor.h b/lib/local-execution/include/local-execution/local_task_argument_accessor.h index 53026f81fd..d5945c8bb3 100644 --- a/lib/local-execution/include/local-execution/local_task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/local_task_argument_accessor.h @@ -1,20 +1,19 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TASK_ARGUMENT_ACCESSOR_H #define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TASK_ARGUMENT_ACCESSOR_H -#include "local-execution/tensor_slot_backing.dtg.h" +#include "kernels/accessor.h" #include "pcg/device_id_t.dtg.h" -#include "task-spec/runtime_task_invocation/runtime_arg_config.dtg.h" -#include "task-spec/task_argument_accessor/task_argument_accessor.h" +#include "task-spec/dynamic_graph/dynamic_tensor_accessor.dtg.h" +#include "task-spec/task_argument_accessor/itask_argument_accessor.h" #include "task-spec/task_argument_accessor/task_tensor_parameter.dtg.h" #include -#include namespace FlexFlow { struct LocalTaskArgumentAccessor : public ITaskArgumentAccessor { explicit LocalTaskArgumentAccessor( Allocator const &allocator, - std::unordered_map const + std::unordered_map const &tensor_slots_backing, ProfilingSettings const &profiling_settings, device_handle_t const &ff_handle, @@ -49,7 +48,7 @@ struct LocalTaskArgumentAccessor : public ITaskArgumentAccessor { private: Allocator allocator; - std::unordered_map + std::unordered_map tensor_slots_backing; ProfilingSettings profiling_settings; diff --git a/lib/local-execution/include/local-execution/local_task_registry.dtg.toml b/lib/local-execution/include/local-execution/local_task_registry.dtg.toml deleted file mode 100644 index 056fe39ca7..0000000000 --- a/lib/local-execution/include/local-execution/local_task_registry.dtg.toml +++ /dev/null @@ -1,21 +0,0 @@ -namespace = "FlexFlow" -name = "LocalTaskRegistry" -type = "struct" -features = [ - "eq", - "fmt", - "hash" -] - -includes = [ - "task-spec/task_impl_function.dtg.h", -] - -src_includes = [ - "utils/hash/unordered_map.h", - "utils/fmt/unordered_map.h", -] - -[[fields]] -name = "task_mapping" -type = "std::unordered_map<::FlexFlow::task_id_t, ::FlexFlow::TaskImplFunction>" diff --git a/lib/local-execution/include/local-execution/local_task_registry.h b/lib/local-execution/include/local-execution/local_task_registry.h index 6adacab0a9..8279d1d4c4 100644 --- a/lib/local-execution/include/local-execution/local_task_registry.h +++ b/lib/local-execution/include/local-execution/local_task_registry.h @@ -1,29 +1,30 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_REGISTRY_H #define _FLEXFLOW_LOCAL_EXECUTION_TASK_REGISTRY_H -#include "local-execution/local_task_registry.dtg.h" #include "pcg/layer_attrs.dtg.h" #include "task-spec/device_specific_per_device_op_state.dtg.h" -#include "task-spec/ops/op_task_type.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "utils/units/milliseconds_t.h" +#include namespace FlexFlow { -LocalTaskRegistry construct_local_task_registry_for_layers( - std::unordered_set const &); +std::optional + get_init_task_impl_for_op_attrs(ComputationGraphOpAttrs const &); +std::optional + get_fwd_task_impl_for_op_attrs(ComputationGraphOpAttrs const &); +std::optional + get_bwd_task_impl_for_op_attrs(ComputationGraphOpAttrs const &); std::optional - call_init_task_impl(LocalTaskRegistry const &local_task_registry, - task_id_t task_id, + call_init_task_impl(ComputationGraphOpAttrs const &, TaskArgumentAccessor const &arg_accessor); std::optional - call_fwb_task_impl(LocalTaskRegistry const &local_task_registry, - task_id_t task_id, + call_fwd_task_impl(ComputationGraphOpAttrs const &, TaskArgumentAccessor const &arg_accessor); -void call_generic_task_impl(LocalTaskRegistry const &local_task_registry, - task_id_t task_id, +void call_generic_task_impl(ComputationGraphOpAttrs const &, TaskArgumentAccessor const &arg_accessor); } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/operator_task_set.dtg.toml b/lib/local-execution/include/local-execution/operator_task_set.dtg.toml deleted file mode 100644 index b074d981d1..0000000000 --- a/lib/local-execution/include/local-execution/operator_task_set.dtg.toml +++ /dev/null @@ -1,25 +0,0 @@ -namespace = "FlexFlow" -name = "OperatorTaskSet" -type = "struct" -features = [ - "eq", - "ord", - "hash", - "fmt", -] - -includes = [ - "local-execution/task_id_with_noop_default_t.dtg.h" -] - -[[fields]] -name = "init_task" -type = "::FlexFlow::task_id_with_noop_default_t" - -[[fields]] -name = "fwd_task" -type = "::FlexFlow::task_id_with_noop_default_t" - -[[fields]] -name = "bwd_task" -type = "::FlexFlow::task_id_with_noop_default_t" diff --git a/lib/local-execution/include/local-execution/operator_task_set.h b/lib/local-execution/include/local-execution/operator_task_set.h deleted file mode 100644 index b94ed9ac47..0000000000 --- a/lib/local-execution/include/local-execution/operator_task_set.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPERATOR_TASK_SET_H -#define _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPERATOR_TASK_SET_H - -#include "local-execution/operator_task_set.dtg.h" -#include "op-attrs/computation_graph_op_attrs.dtg.h" -#include "task-spec/ops/op_task_type.dtg.h" -#include "utils/bidict/bidict.h" - -namespace FlexFlow { - -bidict - get_map_from_task_type_to_task(OperatorTaskSet const &); -std::unordered_set - get_all_tasks_in_task_set(OperatorTaskSet const &); - -task_id_with_noop_default_t - get_task_for_task_type(OperatorTaskSet const &op_task_set, - OpTaskType task_type); - -OperatorTaskSet - get_task_set_for_operator(ComputationGraphOpAttrs const &op_attrs); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/per_device_op_state_initialization.h b/lib/local-execution/include/local-execution/per_device_op_state_initialization.h deleted file mode 100644 index 31f8958a1c..0000000000 --- a/lib/local-execution/include/local-execution/per_device_op_state_initialization.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_PER_DEVICE_OP_STATE_INITIALIZATION_H -#define _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_PER_DEVICE_OP_STATE_INITIALIZATION_H - -#include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.dtg.h" -namespace FlexFlow { - -DynamicOpenDataflowGraph perform_per_device_op_state_initialization( - DynamicOpenDataflowGraph const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/task_execution.h b/lib/local-execution/include/local-execution/task_execution.h index 215f1dbc08..91cea6f025 100644 --- a/lib/local-execution/include/local-execution/task_execution.h +++ b/lib/local-execution/include/local-execution/task_execution.h @@ -12,12 +12,14 @@ namespace FlexFlow { TaskArgumentAccessor make_task_argument_accessor_for_invocation( DynamicNodeInvocation const &invocation, ProfilingSettings const &profiling_settings, + device_handle_t const &ff_handle, DeviceType kernel_device_type, PCGOperatorAttrs op_attrs, std::optional const &loss_attrs, std::optional const &per_device_op_state, FFIterationConfig iteration_config, - std::optional const &optimizer_attrs); + std::optional const &optimizer_attrs, + size_t device_idx); void execute_dynamic_node_invocation( DynamicNodeInvocation const &invocation, diff --git a/lib/local-execution/include/local-execution/task_id_with_noop_default_t.h b/lib/local-execution/include/local-execution/task_id_with_noop_default_t.h deleted file mode 100644 index 72e151bcc8..0000000000 --- a/lib/local-execution/include/local-execution/task_id_with_noop_default_t.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_TASK_ID_WITH_NOOP_DEFAULT_T_H -#define _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_TASK_ID_WITH_NOOP_DEFAULT_T_H - -#include "local-execution/task_id_with_noop_default_t.dtg.h" - -namespace FlexFlow { - -task_id_with_noop_default_t make_default_noop_task(); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/tensor_allocation.h b/lib/local-execution/include/local-execution/tensor_allocation.h index 67acb3de70..e0e5e939f4 100644 --- a/lib/local-execution/include/local-execution/tensor_allocation.h +++ b/lib/local-execution/include/local-execution/tensor_allocation.h @@ -14,7 +14,7 @@ DynamicValueAttrs perform_tensor_allocation_for_value(DynamicValueAttrs const &, DynamicOpenDataflowGraph perform_tensor_allocation( DynamicOpenDataflowGraph const &, - std::unordered_map const + std::unordered_map const &preallocated, Allocator &); diff --git a/lib/local-execution/src/local-execution/computation_graph_instance/computation_graph_instance.cc b/lib/local-execution/src/local-execution/computation_graph_instance/computation_graph_instance.cc new file mode 100644 index 0000000000..ed0591fb42 --- /dev/null +++ b/lib/local-execution/src/local-execution/computation_graph_instance/computation_graph_instance.cc @@ -0,0 +1,135 @@ +#include "local-execution/computation_graph_instance/computation_graph_instance.h" +#include "kernels/allocation.h" +#include "local-execution/local_task_registry.h" +#include "local-execution/task_execution.h" +#include "local-execution/tensor_allocation.h" +#include "op-attrs/computation_graph_op_attrs.h" +#include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.h" +#include "task-spec/dynamic_graph/dynamic_tensor_accessor.dtg.h" +#include "task-spec/dynamic_graph/make_dynamic_open_dataflow_graph_from_cg.h" +#include "task-spec/dynamic_graph/pass_expansion.h" +#include "task-spec/dynamic_graph/update_insertion.h" +#include "task-spec/task_argument_accessor/task_argument_accessor.h" +#include "utils/containers/all_are_true.h" +#include "utils/containers/transform.h" +#include "utils/exception.h" +#include "utils/optional.h" +#include + +namespace FlexFlow { + +bool no_nodes_are_initialized(DynamicOpenDataflowGraph const &g) { + return all_are_true( + transform(get_dynamic_nodes(g), [](DynamicNodeAttrs const &n) -> bool { + return !n.per_device_op_state.has_value(); + })); +} + +bool all_nodes_are_initialized(DynamicOpenDataflowGraph const &g) { + return all_are_true( + transform(get_dynamic_nodes(g), [](DynamicNodeAttrs const &n) -> bool { + return n.per_device_op_state.has_value(); + })); +} + +ComputationGraphInstance::ComputationGraphInstance( + DynamicOpenDataflowGraph dg, Allocator &alloc) + : initialized_dataflow_graph(dg), allocator(alloc) {} + +DynamicNodeInvocation + initialize_node(DynamicNodeInvocation const &i, + Allocator &allocator, + ProfilingSettings const &profiling_settings, + device_handle_t const &device_handle, + DeviceType kernel_device_type, + FFIterationConfig const &iteration_config, + size_t device_idx) { + // Get op + ComputationGraphOpAttrs op_attrs = + assert_unwrap(compgraph_op_attrs_from_pcg_op_attrs( + assert_unwrap(i.node_attrs.op_attrs))); + + // Prepare arguments + TaskArgumentAccessor arg_accessor = + make_task_argument_accessor_for_invocation( + /*invocation=*/i, + /*profiling_settings=*/profiling_settings, + /*ff_handle=*/device_handle, + /*kernel_device_type=*/kernel_device_type, + /*op_attrs=*/assert_unwrap(i.node_attrs.op_attrs), + /*loss_attrs=*/std::nullopt, + /*per_device_op_state=*/std::nullopt, + /*iteration_config=*/iteration_config, + /*optimizer_attrs=*/std::nullopt, + /*device_idx=*/device_idx); + + // Run task init + std::optional per_device_op_state = + call_init_task_impl(op_attrs, arg_accessor); + + DynamicNodeAttrs node_attrs{ + /*task_type=*/i.node_attrs.task_type, + /*device_coord=*/i.node_attrs.device_coord, + /*mapping=*/i.node_attrs.mapping, + /*op_attrs=*/i.node_attrs.op_attrs, + /*layer_guid=*/i.node_attrs.layer_guid, + /*per_device_op_state=*/per_device_op_state, + }; + return DynamicNodeInvocation{ + /*inputs=*/ + i.inputs, + /*node_attrs=*/ + node_attrs, + /*outputs=*/ + i.outputs, + }; +} + +ComputationGraphInstance initialize_computation_graph_instance( + ComputationGraph const &cg, + OptimizerAttrs const &optimizer, + std::unordered_map const + &input_tensors, + Allocator &allocator, + ProfilingSettings const &profiling_settings, + device_handle_t const &device_handle, + DeviceType kernel_device_type, + FFIterationConfig const &iteration_config, + size_t device_idx) { + DynamicOpenDataflowGraph dg = make_dynamic_open_dataflow_graph_from_cg(cg); + dg = perform_pass_expansion(dg); + dg = perform_update_insertion(dg, optimizer); + dg = perform_tensor_allocation(dg, input_tensors, allocator); + + // Initialize all operators and save the per-device op state + ASSERT(no_nodes_are_initialized(dg)); + dg = transform_dynamic_invocation_set( + dg, [&](DynamicNodeInvocation const &invocation) { + return initialize_node(invocation, + allocator, + profiling_settings, + device_handle, + kernel_device_type, + iteration_config, + device_idx); + }); + ASSERT(all_nodes_are_initialized(dg)); + + return ComputationGraphInstance{dg, allocator}; +} + +std::unordered_map> + perform_forward_pass_for_computation_graph_instance( + ComputationGraphInstance const &instance) { + + NOT_IMPLEMENTED(); +} + +std::unordered_map> + perform_backward_pass_for_computation_graph_instance( + ComputationGraphInstance const &instance) { + + NOT_IMPLEMENTED(); +} + +} // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/computation_graph_instance/initialized_computation_graph_instance.cc b/lib/local-execution/src/local-execution/computation_graph_instance/initialized_computation_graph_instance.cc deleted file mode 100644 index a9f7018bb2..0000000000 --- a/lib/local-execution/src/local-execution/computation_graph_instance/initialized_computation_graph_instance.cc +++ /dev/null @@ -1,19 +0,0 @@ -#include "local-execution/computation_graph_instance/initialized_computation_graph_instance.h" - -namespace FlexFlow { - -std::unordered_map> - perform_forward_pass_for_computation_graph_instance( - InitializedComputationGraphInstance const &instance) { - - NOT_IMPLEMENTED(); -} - -std::unordered_map> - perform_backward_pass_for_computation_graph_instance( - InitializedComputationGraphInstance const &instance) { - - NOT_IMPLEMENTED(); -} - -} // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/cost_estimator/local_cost_estimator.cc b/lib/local-execution/src/local-execution/cost_estimator/local_cost_estimator.cc index fc181d26b0..79e2dcafb2 100644 --- a/lib/local-execution/src/local-execution/cost_estimator/local_cost_estimator.cc +++ b/lib/local-execution/src/local-execution/cost_estimator/local_cost_estimator.cc @@ -1,3 +1,5 @@ +#if 0 // FIXME (Elliott): fix cost estimator + #include "local-execution/cost_estimator/local_cost_estimator.h" #include "compiler/machine_mapping/machine_view.dtg.h" #include "kernels/create_local_allocator_for_device_type.h" @@ -151,3 +153,5 @@ CostEstimator } } // namespace FlexFlow + +#endif diff --git a/lib/local-execution/src/local-execution/cost_estimator/tracked_allocator.cc b/lib/local-execution/src/local-execution/cost_estimator/tracked_allocator.cc index 3ac7352e59..2930ba0c86 100644 --- a/lib/local-execution/src/local-execution/cost_estimator/tracked_allocator.cc +++ b/lib/local-execution/src/local-execution/cost_estimator/tracked_allocator.cc @@ -1,3 +1,5 @@ +#if 0 // FIXME (Elliott): fix cost estimator + #include "local-execution/tracked_allocator.h" #include "kernels/device.h" @@ -33,3 +35,5 @@ Allocator get_tracked_memory_allocator(Allocator const &base_allocator) { } } // namespace FlexFlow + +#endif diff --git a/lib/local-execution/src/local-execution/execute_task_for_layer.cc b/lib/local-execution/src/local-execution/execute_task_for_layer.cc index 5a7ea74e52..2c25f0b60f 100644 --- a/lib/local-execution/src/local-execution/execute_task_for_layer.cc +++ b/lib/local-execution/src/local-execution/execute_task_for_layer.cc @@ -1,12 +1,7 @@ +#if 0 // FIXME (Elliott): fix execute task + #include "local-execution/execute_task_for_layer.h" -#include "local-execution/atomic_task_binding.dtg.h" -#include "local-execution/local_atomic_tensor_backing.h" -#include "local-execution/local_ready_to_launch_task.dtg.h" #include "local-execution/local_task_registry.h" -#include "local-execution/local_tensor_backing.h" -#include "task-spec/fwb_op_task_type.h" -#include "task-spec/runtime_task_invocation/runtime_task_invocation.dtg.h" -#include "task-spec/symbolic/training_symbolic_computation_graph.h" #include "utils/containers/flatmap.h" namespace FlexFlow { @@ -272,3 +267,5 @@ std::unordered_map> } } // namespace FlexFlow + +#endif diff --git a/lib/local-execution/src/local-execution/local_atomic_tensor_backing.cc b/lib/local-execution/src/local-execution/local_atomic_tensor_backing.cc deleted file mode 100644 index c43fd6bdf3..0000000000 --- a/lib/local-execution/src/local-execution/local_atomic_tensor_backing.cc +++ /dev/null @@ -1,35 +0,0 @@ -#include "local-execution/local_atomic_tensor_backing.h" -#include "local-execution/local_task_argument_accessor.h" -#include "utils/containers/map_values.h" - -namespace FlexFlow { - -std::unordered_map - construct_tensor_slots_backing_for_binding( - LocalAtomicTensorBacking const &tensor_backing, - AtomicTaskBinding const &binding) { - return map_values(binding.tensor_bindings, - [&](atomic_training_tensor_guid_t t) -> TensorSlotBacking { - return TensorSlotBacking{ - tensor_backing.accessor_from_atomic_tensor_map.at(t), - }; - }); -} - -TaskArgumentAccessor get_task_arg_accessor_for_atomic_task_invocation( - LocalAtomicTensorBacking const &local_tensor_backing, - AtomicTaskBinding const &atomic_task_binding, - Allocator &allocator) { - - std::unordered_map - tensor_slots_backing = construct_tensor_slots_backing_for_binding( - local_tensor_backing, atomic_task_binding); - - std::unordered_map arg_slots_backing = - atomic_task_binding.arg_bindings; - - return TaskArgumentAccessor::create( - allocator, tensor_slots_backing, arg_slots_backing, 0); -} - -} // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/local_concrete_task_graph.cc b/lib/local-execution/src/local-execution/local_concrete_task_graph.cc deleted file mode 100644 index 9806758f06..0000000000 --- a/lib/local-execution/src/local-execution/local_concrete_task_graph.cc +++ /dev/null @@ -1,12 +0,0 @@ -#include "local-execution/local_concrete_task_graph.h" - -namespace FlexFlow { - -std::vector - local_concrete_task_graph_topological_ordering( - LocalConcreteTaskGraph const &) { - - NOT_IMPLEMENTED(); -} - -} // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/local_device_states_backing.cc b/lib/local-execution/src/local-execution/local_device_states_backing.cc deleted file mode 100644 index 1dc34b120d..0000000000 --- a/lib/local-execution/src/local-execution/local_device_states_backing.cc +++ /dev/null @@ -1,48 +0,0 @@ -#include "local-execution/local_device_states_backing.h" -#include "local-execution/local_task_registry.h" -#include "local-execution/local_tensor_backing.h" -#include "task-spec/task_signature_impl.h" -#include "utils/containers/generate_map.h" -#include "utils/containers/keys.h" -#include "utils/overload.h" - -namespace FlexFlow { - -// LocalDeviceStatesBacking -// make_local_device_states_backing_for_computation_graph( -// LocalTaskRegistry const &task_registry, -// std::unordered_map const &layers, -// std::unordered_map const -// &op_attrs, RuntimeArgConfig const &runtime_arg_config, LocalTensorBacking -// const &local_tensor_backing, Allocator &allocator) { -// -// std::unordered_map> -// per_device_op_states = generate_map( -// keys(layers), -// [&](layer_guid_t const &layer_guid) -> -// std::optional { -// return create_per_device_op_state( -// task_registry, -// local_tensor_backing, -// runtime_arg_config, -// allocator, -// op_attrs, -// layers.at(layer_guid)); -// }); -// -// return LocalDeviceStatesBacking{ -// per_device_op_states, -// }; -// } - -// std::optional -// get_per_device_op_state_if_exists( -// LocalArgsBacking const &local_args_backing, -// layer_guid_t const &layer_guid) { -// -// return local_args_backing.per_device_op_states.at(layer_guid); -// } - -} // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/local_task_argument_accessor.cc b/lib/local-execution/src/local-execution/local_task_argument_accessor.cc index c9bdb84fbf..8ab9e02544 100644 --- a/lib/local-execution/src/local-execution/local_task_argument_accessor.cc +++ b/lib/local-execution/src/local-execution/local_task_argument_accessor.cc @@ -1,15 +1,13 @@ #include "local-execution/local_task_argument_accessor.h" +#include "kernels/accessor.h" #include "pcg/device_id_t.h" -#include "utils/containers/contains_key.h" -#include "utils/containers/transform.h" -#include "utils/hash/pair.h" -#include "utils/overload.h" +#include "utils/optional.h" namespace FlexFlow { LocalTaskArgumentAccessor::LocalTaskArgumentAccessor( Allocator const &allocator, - std::unordered_map const + std::unordered_map const &tensor_slots_backing, ProfilingSettings const &profiling_settings, device_handle_t const &ff_handle, @@ -28,18 +26,21 @@ LocalTaskArgumentAccessor::LocalTaskArgumentAccessor( device_idx(make_device_id_t_from_idx(nonnegative_int{device_idx}, kernel_device_type)) {} -GenericTensorAccessor LocalTaskArgumentAccessor::get_tensor( - TensorSlotName slot, - Permissions priv, - TrainingTensorType tensor_type) const { - GenericTensorAccessorW tensor_backing = - this->tensor_slots_backing.at(slot_tensor_type).require_single(); +GenericTensorAccessor + LocalTaskArgumentAccessor::get_tensor(TaskTensorParameter slot, + Permissions priv) const { + DynamicTensorAccessor tensor_backing = this->tensor_slots_backing.at(slot); if (priv == Permissions::RO) { - GenericTensorAccessorR readonly_tensor_backing = - read_only_accessor_from_write_accessor(tensor_backing); - return readonly_tensor_backing; + if (tensor_backing.has()) { + return tensor_backing.get(); + } else { + GenericTensorAccessorR readonly_tensor_backing = + read_only_accessor_from_write_accessor( + tensor_backing.get()); + return readonly_tensor_backing; + } } else if (priv == Permissions::RW || priv == Permissions::WO) { - return tensor_backing; + return tensor_backing.get(); } else { PANIC(fmt::format("Unhandled privilege mode {}", priv)); } @@ -81,7 +82,7 @@ Allocator LocalTaskArgumentAccessor::get_allocator() const { return this->allocator; } -size_t LocalTaskArgumentAccessor::get_device_idx() const { +device_id_t LocalTaskArgumentAccessor::get_device_idx() const { return this->device_idx; } diff --git a/lib/local-execution/src/local-execution/local_task_registry.cc b/lib/local-execution/src/local-execution/local_task_registry.cc index fb6936425d..f836e29764 100644 --- a/lib/local-execution/src/local-execution/local_task_registry.cc +++ b/lib/local-execution/src/local-execution/local_task_registry.cc @@ -1,49 +1,163 @@ #include "local-execution/local_task_registry.h" -#include "local-execution/operator_task_set.h" #include "pcg/computation_graph.h" -#include "task-spec/task_signature_impl.h" -#include "utils/containers/contains_key.h" -#include "utils/containers/filtrans.h" -#include "utils/containers/flatmap.h" -#include "utils/containers/generate_map.h" -#include "utils/containers/map_values.h" -#include "utils/containers/try_at.h" -#include "utils/containers/values.h" +#include "task-spec/ops/impl/attention.h" +#include "task-spec/ops/impl/batch_matmul.h" +#include "task-spec/ops/impl/batch_norm.h" +#include "task-spec/ops/impl/broadcast.h" +#include "task-spec/ops/impl/cast.h" +#include "task-spec/ops/impl/concat.h" +#include "task-spec/ops/impl/conv_2d.h" +#include "task-spec/ops/impl/dropout.h" +#include "task-spec/ops/impl/element_binary.h" +#include "task-spec/ops/impl/element_unary.h" +#include "task-spec/ops/impl/embedding.h" +#include "task-spec/ops/impl/flat.h" +#include "task-spec/ops/impl/gather.h" +#include "task-spec/ops/impl/layer_norm.h" +#include "task-spec/ops/impl/linear.h" +#include "task-spec/ops/impl/pool_2d.h" +#include "task-spec/ops/impl/reduce.h" +#include "task-spec/ops/impl/reshape.h" +#include "task-spec/ops/impl/reverse.h" +#include "task-spec/ops/impl/softmax.h" +#include "task-spec/ops/impl/split.h" +#include "task-spec/ops/impl/topk.h" +#include "task-spec/ops/impl/transpose.h" +#include "task-spec/task_impl_function.dtg.h" +#include "utils/optional.h" +#include "utils/overload.h" +#include namespace FlexFlow { -LocalTaskRegistry construct_local_task_registry_for_layers( - std::unordered_set const &op_attrs) { - - std::unordered_set task_ids = flatmap( - op_attrs, - [](ComputationGraphOpAttrs const &op_attrs) - -> std::unordered_set { return get_task_ids(op_attrs); }); +std::optional + get_init_task_impl_for_op_attrs(ComputationGraphOpAttrs const &op_attrs) { + + return op_attrs.visit>(overload{ + [](BatchMatmulAttrs const &) { return std::nullopt; }, + [](BatchNormAttrs const &) { return get_batch_norm_init_task_impl(); }, + [](BroadcastAttrs const &) { return std::nullopt; }, + [](CastAttrs const &) { return std::nullopt; }, + [](ConcatAttrs const &) { return std::nullopt; }, + [](Conv2DAttrs const &) { return get_conv_2d_init_task_impl(); }, + [](DropoutAttrs const &) { return get_dropout_init_task_impl(); }, + [](ElementBinaryAttrs const &) { + return get_element_binary_init_task_impl(); + }, + [](ElementUnaryAttrs const &) { + return get_element_unary_init_task_impl(); + }, + [](EmbeddingAttrs const &) { return std::nullopt; }, + [](FlatAttrs const &) { return std::nullopt; }, + [](GatherAttrs const &) { return get_gather_init_task_impl(); }, + [](InputAttrs const &) { return std::nullopt; }, + [](LayerNormAttrs const &) { return get_layer_norm_init_task_impl(); }, + [](LinearAttrs const &) { return get_linear_init_task_impl(); }, + [](MultiHeadAttentionAttrs const &) { + return get_attention_init_task_impl(); + }, + [](NoopAttrs const &) { return std::nullopt; }, + [](Pool2DAttrs const &) { return get_pool_2d_init_task_impl(); }, + [](ReduceAttrs const &) { return get_reduce_init_task_impl(); }, + [](ReshapeAttrs const &) { return std::nullopt; }, + [](ReverseAttrs const &) { return std::nullopt; }, + [](SoftmaxAttrs const &) { return get_softmax_init_task_impl(); }, + [](SplitAttrs const &) { return std::nullopt; }, + [](TopKAttrs const &) { return std::nullopt; }, + [](TransposeAttrs const &) { return std::nullopt; }, + [](WeightAttrs const &) { return std::nullopt; }, + }); +} - std::unordered_map task_mapping = - generate_map(task_ids, get_task_signature_and_impl_for_task_id); +std::optional + get_fwd_task_impl_for_op_attrs(ComputationGraphOpAttrs const &op_attrs) { + + return op_attrs.visit>(overload{ + [](BatchMatmulAttrs const &) { return get_batch_matmul_fwd_task_impl(); }, + [](BatchNormAttrs const &) { return get_batch_norm_fwd_task_impl(); }, + [](BroadcastAttrs const &) { return get_broadcast_fwd_task_impl(); }, + [](CastAttrs const &) { return get_cast_fwd_task_impl(); }, + [](ConcatAttrs const &) { return get_concat_fwd_task_impl(); }, + [](Conv2DAttrs const &) { return get_conv_2d_fwd_task_impl(); }, + [](DropoutAttrs const &) { return get_dropout_fwd_task_impl(); }, + [](ElementBinaryAttrs const &) { + return get_element_binary_fwd_task_impl(); + }, + [](ElementUnaryAttrs const &) { + return get_element_unary_fwd_task_impl(); + }, + [](EmbeddingAttrs const &) { return get_embedding_fwd_task_impl(); }, + [](FlatAttrs const &) { return get_flat_fwd_task_impl(); }, + [](GatherAttrs const &) { return get_gather_fwd_task_impl(); }, + [](InputAttrs const &) { return std::nullopt; }, + [](LayerNormAttrs const &) { return get_layer_norm_fwd_task_impl(); }, + [](LinearAttrs const &) { return get_linear_fwd_task_impl(); }, + [](MultiHeadAttentionAttrs const &) { + return get_attention_fwd_task_impl(); + }, + [](NoopAttrs const &) { return std::nullopt; }, + [](Pool2DAttrs const &) { return get_pool_2d_fwd_task_impl(); }, + [](ReduceAttrs const &) { return get_reduce_fwd_task_impl(); }, + [](ReshapeAttrs const &) { return get_reshape_fwd_task_impl(); }, + [](ReverseAttrs const &) { return get_reverse_fwd_task_impl(); }, + [](SoftmaxAttrs const &) { return get_softmax_fwd_task_impl(); }, + [](SplitAttrs const &) { return get_split_fwd_task_impl(); }, + [](TopKAttrs const &) { return get_topk_fwd_task_impl(); }, + [](TransposeAttrs const &) { return get_transpose_fwd_task_impl(); }, + [](WeightAttrs const &) { return std::nullopt; }, + }); +} - return LocalTaskRegistry{ - /*task_mapping=*/task_mapping, - }; +std::optional + get_bwd_task_impl_for_op_attrs(ComputationGraphOpAttrs const &op_attrs) { + + return op_attrs.visit>(overload{ + [](BatchMatmulAttrs const &) { return get_batch_matmul_bwd_task_impl(); }, + [](BatchNormAttrs const &) { return get_batch_norm_bwd_task_impl(); }, + [](BroadcastAttrs const &) { return get_broadcast_bwd_task_impl(); }, + [](CastAttrs const &) { return get_cast_bwd_task_impl(); }, + [](ConcatAttrs const &) { return get_concat_bwd_task_impl(); }, + [](Conv2DAttrs const &) { return get_conv_2d_bwd_task_impl(); }, + [](DropoutAttrs const &) { return get_dropout_bwd_task_impl(); }, + [](ElementBinaryAttrs const &) { + return get_element_binary_bwd_task_impl(); + }, + [](ElementUnaryAttrs const &) { + return get_element_unary_bwd_task_impl(); + }, + [](EmbeddingAttrs const &) { return get_embedding_bwd_task_impl(); }, + [](FlatAttrs const &) { return get_flat_bwd_task_impl(); }, + [](GatherAttrs const &) { return get_gather_bwd_task_impl(); }, + [](InputAttrs const &) { return std::nullopt; }, + [](LayerNormAttrs const &) { return get_layer_norm_bwd_task_impl(); }, + [](LinearAttrs const &) { return get_linear_bwd_task_impl(); }, + [](MultiHeadAttentionAttrs const &) { + return get_attention_bwd_task_impl(); + }, + [](NoopAttrs const &) { return std::nullopt; }, + [](Pool2DAttrs const &) { return get_pool_2d_bwd_task_impl(); }, + [](ReduceAttrs const &) { return get_reduce_bwd_task_impl(); }, + [](ReshapeAttrs const &) { return get_reshape_bwd_task_impl(); }, + [](ReverseAttrs const &) { return get_reverse_bwd_task_impl(); }, + [](SoftmaxAttrs const &) { return get_softmax_bwd_task_impl(); }, + [](SplitAttrs const &) { return get_split_bwd_task_impl(); }, + [](TopKAttrs const &) { return get_topk_bwd_task_impl(); }, + [](TransposeAttrs const &) { return get_transpose_bwd_task_impl(); }, + [](WeightAttrs const &) { return std::nullopt; }, + }); } std::optional - call_init_task_impl(LocalTaskRegistry const &local_task_registry, - task_id_with_noop_default_t registered_task, + call_init_task_impl(ComputationGraphOpAttrs const &op_attrs, TaskArgumentAccessor const &arg_accessor) { - - if (registered_task.is_noop_task()) { + std::optional task_impl_fn = + get_init_task_impl_for_op_attrs(op_attrs); + if (!task_impl_fn) { return std::nullopt; } - task_id_t task_id = registered_task.require_real_task(); - - TaskSignatureAndImpl task_sig_impl = - local_task_registry.task_mapping.at(task_id); - auto fn = - task_sig_impl.impl_function.get().function_ptr; + assert_unwrap(task_impl_fn).get().function_ptr; std::optional device_state = fn(arg_accessor); @@ -51,24 +165,23 @@ std::optional } std::optional - call_fwb_task_impl(LocalTaskRegistry const &task_registry, - task_id_t const &task_id, + call_fwb_task_impl(ComputationGraphOpAttrs const &op_attrs, TaskArgumentAccessor const &acc) { - TaskSignatureAndImpl task_sig_impl = task_registry.task_mapping.at(task_id); + std::optional task_impl_fn = + get_fwd_task_impl_for_op_attrs(op_attrs); + if (!task_impl_fn) { + return std::nullopt; + } auto fn = - task_sig_impl.impl_function.get().function_ptr; + assert_unwrap(task_impl_fn).get().function_ptr; return fn(acc); } -void call_generic_task_impl(LocalTaskRegistry const &task_registry, - task_id_t const &task_id, +void call_generic_task_impl(ComputationGraphOpAttrs const &op_attrs, TaskArgumentAccessor const &acc) { - TaskSignatureAndImpl task_sig_impl = task_registry.task_mapping.at(task_id); - auto fn = - task_sig_impl.impl_function.get().function_ptr; - fn(acc); + NOT_IMPLEMENTED(); } } // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/operator_task_set.cc b/lib/local-execution/src/local-execution/operator_task_set.cc deleted file mode 100644 index a1b1d0817b..0000000000 --- a/lib/local-execution/src/local-execution/operator_task_set.cc +++ /dev/null @@ -1,71 +0,0 @@ -#include "local-execution/operator_task_set.h" -#include "local-execution/task_id_with_noop_default_t.h" -#include "task-spec/task_signature_impl.h" -#include "utils/bidict/algorithms/right_entries.h" -#include "utils/containers/values.h" - -namespace FlexFlow { - -bidict - get_map_from_task_type_to_task(OperatorTaskSet const &op_task_set) { - return { - {OpTaskType::INIT, op_task_set.init_task}, - {OpTaskType::FWD, op_task_set.fwd_task}, - {OpTaskType::BWD, op_task_set.bwd_task}, - }; -} - -std::unordered_set - get_all_tasks_in_task_set(OperatorTaskSet const &op_task_set) { - return right_entries(get_map_from_task_type_to_task(op_task_set)); -} - -task_id_with_noop_default_t - get_task_for_task_type(OperatorTaskSet const &op_task_set, - OpTaskType task_type) { - return get_map_from_task_type_to_task(op_task_set).at_l(task_type); -} - -OperatorTaskSet - get_task_set_for_operator(ComputationGraphOpAttrs const &attrs) { - task_id_with_noop_default_t init_task = make_default_noop_task(); - task_id_with_noop_default_t fwd_task = make_default_noop_task(); - task_id_with_noop_default_t bwd_task = make_default_noop_task(); - - std::vector task_ids = get_task_ids(attrs); - - for (task_id_t const &task_id : task_ids) { - TaskSignatureAndImpl task_signature_and_impl = - get_task_signature_and_impl_for_task_id(task_id); - - OpTaskSignature task_signature = task_signature_and_impl.task_signature; - - switch (task_signature.type) { - case OpTaskType::INIT: - ASSERT(is_invocation_valid(task_signature, - get_init_op_task_invocation(attrs))); - init_task = task_id_with_noop_default_t{task_id}; - break; - case OpTaskType::FWD: - ASSERT(is_invocation_valid(task_signature, - get_forward_op_task_invocation(attrs))); - fwd_task = task_id_with_noop_default_t{task_id}; - break; - case OpTaskType::BWD: - ASSERT(is_invocation_valid(task_signature, - get_backward_op_task_invocation(attrs))); - bwd_task = task_id_with_noop_default_t{task_id}; - break; - default: - PANIC("Unhandled OpTaskType", fmt::to_string(task_signature.type)); - } - } - - return OperatorTaskSet{ - /*init_task=*/init_task, - /*fwd_task=*/fwd_task, - /*bwd_task=*/bwd_task, - }; -} - -} // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/task_execution.cc b/lib/local-execution/src/local-execution/task_execution.cc index 09276aa218..60aaab8f5f 100644 --- a/lib/local-execution/src/local-execution/task_execution.cc +++ b/lib/local-execution/src/local-execution/task_execution.cc @@ -1,5 +1,6 @@ #include "local-execution/task_execution.h" #include "local-execution/local_task_argument_accessor.h" +#include "utils/exception.h" namespace FlexFlow { @@ -7,18 +8,41 @@ TaskArgumentAccessor make_task_argument_accessor_for_invocation( DynamicNodeInvocation const &invocation, Allocator &allocator, ProfilingSettings const &profiling_settings, + device_handle_t const &ff_handle, DeviceType kernel_device_type, PCGOperatorAttrs op_attrs, std::optional const &loss_attrs, std::optional const &per_device_op_state, FFIterationConfig iteration_config, - std::optional const &optimizer_attrs) { - std::unordered_map < + std::optional const &optimizer_attrs, + size_t device_idx) { + std::unordered_map + tensor_slots_backing; + + return TaskArgumentAccessor::create( + /*allocator=*/allocator, + /*tensor_slots_backing=*/tensor_slots_backing, + /*profiling_settings=*/profiling_settings, + /*ff_handle=*/ff_handle, + /*kernel_device_type=*/kernel_device_type, + /*op_attrs=*/op_attrs, + /*loss_attrs=*/loss_attrs, + /*per_device_op_state=*/per_device_op_state, + /*iteration_config=*/iteration_config, + /*optimizer_attrs=*/optimizer_attrs, + /*device_idx=*/device_idx); +} - return TaskArgumentAccessor::create( - /*allocator=*/allocator, - /*tensor_slots_backing=*/ - ); +void execute_dynamic_node_invocation( + DynamicNodeInvocation const &invocation, + ProfilingSettings const &profiling_settings, + DeviceType kernel_device_type, + PCGOperatorAttrs op_attrs, + std::optional const &loss_attrs, + std::optional const &per_device_op_state, + FFIterationConfig iteration_config, + std::optional const &optimizer_attrs) { + NOT_IMPLEMENTED(); } } // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/task_id_with_noop_default_t.cc b/lib/local-execution/src/local-execution/task_id_with_noop_default_t.cc deleted file mode 100644 index 15b2fe786b..0000000000 --- a/lib/local-execution/src/local-execution/task_id_with_noop_default_t.cc +++ /dev/null @@ -1,9 +0,0 @@ -#include "local-execution/task_id_with_noop_default_t.h" - -namespace FlexFlow { - -task_id_with_noop_default_t make_noop_registered_task() { - return task_id_with_noop_default_t{std::monostate{}}; -} - -} // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/tensor_allocation.cc b/lib/local-execution/src/local-execution/tensor_allocation.cc index 16d6712616..5ea8c6b022 100644 --- a/lib/local-execution/src/local-execution/tensor_allocation.cc +++ b/lib/local-execution/src/local-execution/tensor_allocation.cc @@ -1,6 +1,7 @@ #include "local-execution/tensor_allocation.h" #include "op-attrs/parallel_tensor_shape.h" #include "task-spec/dynamic_graph/dynamic_open_dataflow_graph.h" +#include "task-spec/dynamic_graph/dynamic_tensor_accessor.dtg.h" #include "utils/bidict/generate_bidict.h" #include "utils/containers/all_are_true.h" #include "utils/containers/contains_key.h" @@ -35,16 +36,17 @@ DynamicValueAttrs GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); DynamicValueAttrs result = value; - result.accessor = accessor; + result.accessor = DynamicTensorAccessor{accessor}; return result; } DynamicOpenDataflowGraph perform_tensor_allocation( DynamicOpenDataflowGraph const &g, - std::unordered_map const + std::unordered_map const &preallocated, Allocator &allocator) { + ASSERT(no_tensors_are_allocated(g)); for (DynamicValueAttrs const &v : keys(preallocated)) { ASSERT(v.accessor == std::nullopt); } @@ -64,7 +66,7 @@ DynamicOpenDataflowGraph perform_tensor_allocation( } }); - return transform_dynamic_invocation_set( + DynamicOpenDataflowGraph result = transform_dynamic_invocation_set( g, [&](DynamicNodeInvocation const &i) -> DynamicNodeInvocation { return DynamicNodeInvocation{ /*inputs=*/map_values( @@ -80,6 +82,10 @@ DynamicOpenDataflowGraph perform_tensor_allocation( }), }; }); + + ASSERT(all_tensors_are_allocated(result)); + + return result; } } // namespace FlexFlow diff --git a/lib/local-execution/test/src/internal/test_utils.cc b/lib/local-execution/test/src/internal/test_utils.cc index 629640b6ae..4ef8d937b6 100644 --- a/lib/local-execution/test/src/internal/test_utils.cc +++ b/lib/local-execution/test/src/internal/test_utils.cc @@ -12,8 +12,9 @@ size_t MockTensorGuidSource::next_available_mock_tensor_guid = 0; MockTensorGuidSource::MockTensorGuidSource() {} tensor_guid_t MockTensorGuidSource::new_mock_tensor_guid() { + // FIXME (Elliott): where is the guid supposed to go now??? size_t next_guid = MockTensorGuidSource::next_available_mock_tensor_guid++; - return tensor_guid_t{DataflowOutput{Node{0}, nonnegative_int{next_guid}}}; + return tensor_guid_t{KwargDataflowOutput{Node{0}, TensorSlotName::INPUT}}; } } // namespace FlexFlow diff --git a/lib/local-execution/test/src/local-execution/local_cost_estimator.cc b/lib/local-execution/test/src/local-execution/local_cost_estimator.cc index 1e0891e1a3..788817d3ed 100644 --- a/lib/local-execution/test/src/local-execution/local_cost_estimator.cc +++ b/lib/local-execution/test/src/local-execution/local_cost_estimator.cc @@ -1,4 +1,5 @@ -#include "local-execution/local_cost_estimator.h" +#if 0 // FIXME (Elliott): fix cost estimator +#include "local-execution/cost_estimator/local_cost_estimator.h" #include "compiler/machine_mapping/machine_view.h" #include "internal/test_utils.h" #include "kernels/device_handle_t.h" @@ -140,3 +141,4 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { } } } +#endif diff --git a/lib/local-execution/test/src/local-execution/local_task_argument_accessor.cc b/lib/local-execution/test/src/local-execution/local_task_argument_accessor.cc index ff90abcde7..f9eb0796b8 100644 --- a/lib/local-execution/test/src/local-execution/local_task_argument_accessor.cc +++ b/lib/local-execution/test/src/local-execution/local_task_argument_accessor.cc @@ -1,6 +1,6 @@ #include "local-execution/local_task_argument_accessor.h" #include "kernels/local_cpu_allocator.h" -#include "task-spec/task_signature_impl.h" +#include "task-spec/task_impl_function.dtg.h" #include "utils/fmt/variant.h" #include diff --git a/lib/local-execution/include/local-execution/tensor_slot_backing.dtg.toml b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_tensor_accessor.dtg.toml similarity index 50% rename from lib/local-execution/include/local-execution/tensor_slot_backing.dtg.toml rename to lib/task-spec/include/task-spec/dynamic_graph/dynamic_tensor_accessor.dtg.toml index 4d8c817461..2cb661992c 100644 --- a/lib/local-execution/include/local-execution/tensor_slot_backing.dtg.toml +++ b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_tensor_accessor.dtg.toml @@ -1,24 +1,20 @@ namespace = "FlexFlow" -name = "TensorSlotBacking" +name = "DynamicTensorAccessor" type = "variant" features = [ "eq", "fmt", + "hash", ] includes = [ "kernels/accessor.h", - "", -] - -src_includes = [ - "utils/fmt/vector.h", ] [[values]] -type = "::FlexFlow::GenericTensorAccessorW" -key = "single" +name = "accessor_r" +type = "::FlexFlow::GenericTensorAccessorR" [[values]] -type = "std::vector<::FlexFlow::GenericTensorAccessorW>" -key = "variadic" +name = "accessor_w" +type = "::FlexFlow::GenericTensorAccessorW" diff --git a/lib/task-spec/include/task-spec/dynamic_graph/dynamic_value_attrs.dtg.toml b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_value_attrs.dtg.toml index 6638f16e62..89b94b1017 100644 --- a/lib/task-spec/include/task-spec/dynamic_graph/dynamic_value_attrs.dtg.toml +++ b/lib/task-spec/include/task-spec/dynamic_graph/dynamic_value_attrs.dtg.toml @@ -11,9 +11,8 @@ includes = [ "", "task-spec/dynamic_graph/dynamic_tensor_guid_t.dtg.h", "op-attrs/parallel_tensor_shape.dtg.h", - "kernels/accessor.h", - "pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.h", "op-attrs/parallel_tensor_space_coordinate.dtg.h", + "task-spec/dynamic_graph/dynamic_tensor_accessor.dtg.h", "task-spec/dynamic_graph/dynamic_tensor_role.dtg.h", ] @@ -35,7 +34,7 @@ type = "std::optional<::FlexFlow::ParallelTensorSpaceCoordinate>" [[fields]] name = "accessor" -type = "std::optional<::FlexFlow::GenericTensorAccessorW>" +type = "std::optional<::FlexFlow::DynamicTensorAccessor>" [[fields]] name = "role" diff --git a/lib/task-spec/include/task-spec/dynamic_graph/pass_expansion.h b/lib/task-spec/include/task-spec/dynamic_graph/pass_expansion.h index 6dce8ad514..d99f7fb9ce 100644 --- a/lib/task-spec/include/task-spec/dynamic_graph/pass_expansion.h +++ b/lib/task-spec/include/task-spec/dynamic_graph/pass_expansion.h @@ -18,6 +18,7 @@ DynamicNodeInvocation DynamicNodeInvocation perform_bwd_pass_expansion_for_invocation(DynamicNodeInvocation const &); +// pass expansion here DynamicOpenDataflowGraph perform_pass_expansion(DynamicOpenDataflowGraph const &); diff --git a/lib/task-spec/include/task-spec/dynamic_graph/shard_expansion.h b/lib/task-spec/include/task-spec/dynamic_graph/shard_expansion.h index 4e0db1cd7e..4cb40fc2b1 100644 --- a/lib/task-spec/include/task-spec/dynamic_graph/shard_expansion.h +++ b/lib/task-spec/include/task-spec/dynamic_graph/shard_expansion.h @@ -16,6 +16,7 @@ bool graph_is_fully_shard_expanded(DynamicOpenDataflowGraph const &); std::unordered_set perform_shard_expansion_for_invocation(DynamicNodeInvocation const &); +// shard expansion here DynamicOpenDataflowGraph perform_shard_expansion(DynamicOpenDataflowGraph const &); diff --git a/lib/task-spec/include/task-spec/dynamic_graph/update_insertion.h b/lib/task-spec/include/task-spec/dynamic_graph/update_insertion.h index 23fb7050a0..730de2ec81 100644 --- a/lib/task-spec/include/task-spec/dynamic_graph/update_insertion.h +++ b/lib/task-spec/include/task-spec/dynamic_graph/update_insertion.h @@ -10,6 +10,8 @@ std::unordered_set perform_update_insertion_for_invocation(DynamicNodeInvocation const &, OptimizerAttrs const &); +// after backwards pass, update tasks apply gradients to the weights (and this +// inserts the corresponding task) DynamicOpenDataflowGraph perform_update_insertion(DynamicOpenDataflowGraph const &, OptimizerAttrs const &); diff --git a/lib/task-spec/src/task-spec/task_id_with_noop_default_t.cc b/lib/task-spec/src/task-spec/task_id_with_noop_default_t.cc index 998d73e9ff..20e0d00c57 100644 --- a/lib/task-spec/src/task-spec/task_id_with_noop_default_t.cc +++ b/lib/task-spec/src/task-spec/task_id_with_noop_default_t.cc @@ -46,7 +46,7 @@ task_id_with_noop_default_t return lift_task_id_t(task_id_t::ELEMENTBINARY_INIT_TASK_ID); }, [](ElementUnaryAttrs const &) { - return lift_task_id_t(task_id_t::ELEMENTBINARY_INIT_TASK_ID); + return lift_task_id_t(task_id_t::ELEMENTUNARY_INIT_TASK_ID); }, [](EmbeddingAttrs const &) { return default_noop_task(); }, [](FlatAttrs const &) { return default_noop_task(); }, @@ -111,7 +111,7 @@ task_id_with_noop_default_t return lift_task_id_t(task_id_t::ELEMENTBINARY_FWD_TASK_ID); }, [](ElementUnaryAttrs const &) { - return lift_task_id_t(task_id_t::ELEMENTBINARY_FWD_TASK_ID); + return lift_task_id_t(task_id_t::ELEMENTUNARY_FWD_TASK_ID); }, [](EmbeddingAttrs const &) { return lift_task_id_t(task_id_t::EMBED_FWD_TASK_ID); @@ -190,7 +190,7 @@ task_id_with_noop_default_t return lift_task_id_t(task_id_t::ELEMENTBINARY_BWD_TASK_ID); }, [](ElementUnaryAttrs const &) { - return lift_task_id_t(task_id_t::ELEMENTBINARY_BWD_TASK_ID); + return lift_task_id_t(task_id_t::ELEMENTUNARY_BWD_TASK_ID); }, [](EmbeddingAttrs const &) { return lift_task_id_t(task_id_t::EMBED_BWD_TASK_ID);