Skip to content

Commit 0d2ffdb

Browse files
victorli2002lockshawVictor Li
authored and committed
Merge substitution-builder (flexflow#1575)
* Start on pcg builder * Add tests and some implementation for pcg builder * Add pcg tests, make dtgen constructors explicit to fix bug * Add remainder of PCG tests * Fix build issues in local-execution * Format * Address Reyna comments, add topological_order function for PCG * Pre multidigraph refactor * Removing visitable from sp code * Add open dataflow graph, start to replace pcg dataflow graph * Start refactoring substitutions * Add utility functions to support pattern matching * Pre-refactor inputs * Fix proj url * Get back to substitutions, now with unordered graph inputs * Get substitutions building * substitutions-tests now builds * Fix bug in filter, pass some initial substitution tests * Add tests for fmt::to_string, fix some substitutions bugs * Pass initial unit tests for find_pattern_matches * Start on unit tests for pcg pattern * Pass initial test for find_pattern_matches * Fix small build issue in tests * Format * Sync tests in CI with tests in proj * Fix minor build errors in kernels and local-execution * Format * Remove outdated code * More outdated code removal * More cleanup, add test for sp decomposition * Pull apart containers.h * More sp testing and fixes * Break up graph algorithms.h * Pre- full SP algo commit * Add initial implementation and tests for cbc decomposition and inverse line graph * Pass test for get_inverse_line_graph * Add new multidigraph * Fix get_inverse_line_graph to return a MultiDiGraph instead of a DiGraph * Add tests for parallel and series reduction finding * Add really rough implementation of valdez sp decomposition * Fix local-execution build * Add implementations and tests for applying series/parallel reductions * Format * Clean up sp decomposition interface and tests * Format * Add comments for top-level substitutions functions, add proj doxygen support * Start sketching out substitutions code * Fix build errors * Add ability to permute node ids * Cleanup and start to test new substitutions code * Add test case for 
evaluate_substitution_output * Add naive isomorphism detection code * Add graph inputs to open dataflow graph isomorphism * Add input permutation to evaluate_substitution_output * Fix permute_node_ids * Add test for permute_input_ids * Migrate over to mutable implementation of apply_substitution * Add fast isomorphism checking and an initial implementation of full substitution logic * Pass initial full substitutions test * Cleanup old isomorphism checking code * Fix post-merge bugs * Fix broken pcg builder test * Format * Reorganize code and remove some outdated code pre-code-review * Format * Restarting work on this after working on export-model-arch * Adding in a simple function to get the currently available substitutions * nonnegative_int additions, code cleanup, etc. * A bunch more moving over to nonnegative_int * Even more nonnegative_int updating * Fix build * Fix failing tests * Format * Format --------- Co-authored-by: Colin Unger <[email protected]> Co-authored-by: Victor Li <[email protected]>
1 parent 209db7e commit 0d2ffdb

File tree

423 files changed

+7336
-5040
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

423 files changed

+7336
-5040
lines changed

bin/export-model-arch/src/export_model_arch.cc

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include "utils/cli/cli_parse.h"
1414
#include "utils/cli/cli_parse_result.h"
1515
#include "utils/cli/cli_spec.h"
16+
#include "utils/graph/open_dataflow_graph/algorithms/as_dot.h"
1617
#include "utils/graph/series_parallel/binary_sp_decomposition_tree/right_associative_binary_sp_tree_from_nary.h"
1718
#include "utils/graph/series_parallel/get_series_parallel_decomposition.h"
1819

@@ -21,11 +22,11 @@ using namespace ::FlexFlow;
2122
ComputationGraph get_single_operator_computation_graph() {
2223
ComputationGraphBuilder b;
2324

24-
size_t batch_size = 8;
25-
size_t in_channels = 16;
26-
size_t out_channels = 12;
25+
nonnegative_int batch_size = 8_n;
26+
nonnegative_int in_channels = 16_n;
27+
nonnegative_int out_channels = 12_n;
2728
TensorShape input_shape = TensorShape{
28-
TensorDims{FFOrdered<size_t>{
29+
TensorDims{FFOrdered<nonnegative_int>{
2930
batch_size,
3031
in_channels,
3132
out_channels,
@@ -69,7 +70,7 @@ tl::expected<ComputationGraph, std::string>
6970
} else if (model_name == "bert") {
7071
return get_bert_computation_graph(get_default_bert_config());
7172
} else if (model_name == "split_test") {
72-
int batch_size = 8;
73+
nonnegative_int batch_size = 8_n;
7374
return get_split_test_computation_graph(batch_size);
7475
} else if (model_name == "single_operator") {
7576
return get_single_operator_computation_graph();
@@ -100,10 +101,10 @@ tl::expected<JsonSPModelExport, std::string>
100101
result.value();
101102
});
102103

103-
std::pair<V1ComputationGraph, bidict<int, layer_guid_t>> v1_result =
104-
to_v1_including_node_numbering(computation_graph);
104+
std::pair<V1ComputationGraph, bidict<nonnegative_int, layer_guid_t>>
105+
v1_result = to_v1_including_node_numbering(computation_graph);
105106
V1ComputationGraph v1_cg = v1_result.first;
106-
bidict<int, layer_guid_t> layer_numbering = v1_result.second;
107+
bidict<nonnegative_int, layer_guid_t> layer_numbering = v1_result.second;
107108
V1BinarySPDecomposition v1_sp_decomposition =
108109
to_v1(sp_decomposition, layer_numbering);
109110

cmake/flexflow-utils.cmake

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ function(define_ff_vars target)
2020
MAX_TENSOR_DIM=${FF_MAX_DIM}
2121
MAX_NUM_TASK_REGIONS=${FF_MAX_NUM_TASK_REGIONS}
2222
MAX_NUM_TASK_ARGUMENTS=${FF_MAX_NUM_TASK_ARGUMENTS}
23+
# _FORTIFY_SOURCE=0
2324
)
2425

2526
if (FF_GPU_BACKEND STREQUAL "cuda")
@@ -39,7 +40,18 @@ function(ff_set_cxx_properties target)
3940
CXX_EXTENSIONS NO
4041
)
4142
target_compile_options(${target}
42-
PRIVATE $<$<COMPILE_LANGUAGE:CXX>:> "-ffile-prefix-map=${CMAKE_SOURCE_DIR}=." # add C++ compile flags here
43+
PUBLIC
44+
$<$<COMPILE_LANGUAGE:CXX>:>
45+
"-ffile-prefix-map=${CMAKE_SOURCE_DIR}=."
46+
"-fsanitize=undefined"
47+
"-fno-sanitize-recover=all"
48+
# add C++ compile flags here
49+
)
50+
target_link_options(${target}
51+
PUBLIC
52+
$<$<COMPILE_LANGUAGE:CXX>:>
53+
"-fsanitize=undefined"
54+
"-fno-sanitize-recover=all"
4355
)
4456
endfunction()
4557

flake.nix

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,15 @@
3838
};
3939
lib = pkgs.lib;
4040

41-
mkShell = pkgs.mkShell.override {
41+
mkShell = attrs: pkgs.mkShell.override {
4242
stdenv = pkgs.cudaPackages.backendStdenv;
43-
};
43+
} (attrs // {
44+
hardeningDisable = ["all"]; # disable nixpkgs default compiler arguments, otherwise ubsan doesn't catch
45+
# signed overflows due to the signedoverflow hardening setting.
46+
# for more details, see the following (long-running) nixpkgs github issues:
47+
# - https://github.com/NixOS/nixpkgs/issues/18995
48+
# - https://github.com/NixOS/nixpkgs/issues/60919
49+
});
4450

4551
proj = proj-repo.packages.${system}.proj;
4652
in
@@ -121,6 +127,8 @@
121127

122128
gpu-ci = mkShell {
123129
inputsFrom = [ ci ];
130+
hardeningDisable = [ "all" ];
131+
124132
buildInputs = builtins.concatLists [
125133
(with nixGL.packages.${system}; [
126134
nixGLDefault
@@ -135,6 +143,8 @@
135143
"${proj-repo.packages.${system}.proj-nvim}"
136144
];
137145

146+
hardeningDisable = [ "all" ];
147+
138148
buildInputs = builtins.concatLists [
139149
(with pkgs; [
140150
clang-tools

lib/compiler/include/compiler/series_parallel/computation_graph/computation_graph_binary_sp_decomposition.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,9 @@ bool is_right_associative(ComputationGraphBinarySPDecomposition const &);
3636
std::unordered_multiset<layer_guid_t>
3737
get_layers(ComputationGraphBinarySPDecomposition const &);
3838

39-
V1BinarySPDecomposition to_v1(ComputationGraphBinarySPDecomposition const &,
40-
bidict<int, layer_guid_t> const &layer_numbering);
39+
V1BinarySPDecomposition
40+
to_v1(ComputationGraphBinarySPDecomposition const &,
41+
bidict<nonnegative_int, layer_guid_t> const &layer_numbering);
4142

4243
} // namespace FlexFlow
4344

lib/compiler/src/compiler/allowed_machine_views.cc

Lines changed: 25 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,15 @@
1111
#include "utils/containers/map_from_keys_and_values.h"
1212
#include "utils/containers/product.h"
1313
#include "utils/containers/range.h"
14-
#include "utils/containers/replicate.h"
14+
#include "utils/containers/repeat_element.h"
1515
#include "utils/containers/sorted.h"
1616
#include "utils/containers/transform.h"
1717
#include "utils/containers/unordered_multiset_of.h"
1818
#include "utils/containers/unordered_set_of.h"
1919
#include "utils/containers/zip.h"
20+
#include "utils/nonnegative_int/ceildiv.h"
21+
#include "utils/nonnegative_int/nonnegative_range.h"
22+
#include "utils/nonnegative_int/num_elements.h"
2023
#include "utils/overload.h"
2124

2225
namespace FlexFlow {
@@ -47,24 +50,29 @@ static std::unordered_set<MachineView>
4750
OperatorTaskSpace const &task,
4851
DeviceType const &device_type) {
4952

50-
auto get_max_stride_upper_bound = [](std::vector<int> const &tensor_dims,
51-
int total_devices) -> int {
52-
int min_num_devices_with_full_stride_volume = product(transform(
53-
tensor_dims, [](int const &num_devices) { return num_devices - 1; }));
54-
return std::ceil(total_devices / min_num_devices_with_full_stride_volume);
53+
auto get_max_stride_upper_bound =
54+
[](std::vector<nonnegative_int> const &tensor_dims,
55+
nonnegative_int total_devices) -> nonnegative_int {
56+
nonnegative_int min_num_devices_with_full_stride_volume =
57+
product(transform(tensor_dims, [](nonnegative_int num_devices) {
58+
return nonnegative_int{num_devices.unwrap_nonnegative() - 1};
59+
}));
60+
return ceildiv(total_devices, min_num_devices_with_full_stride_volume);
5561
};
5662

57-
auto candidate_strides = [&](std::vector<int> const &tensor_dims,
58-
int total_devices)
63+
auto candidate_strides = [&](std::vector<nonnegative_int> const &tensor_dims,
64+
nonnegative_int total_devices)
5965
-> std::unordered_multiset<MultiDimensionalStride> {
60-
int max_stride_upper_bound =
66+
nonnegative_int max_stride_upper_bound =
6167
get_max_stride_upper_bound(tensor_dims, total_devices);
6268

6369
std::vector<stride_t> single_stride_range =
64-
transform(range(1, max_stride_upper_bound + 1),
65-
[](int stride) { return stride_t{stride}; });
70+
transform(nonnegative_range(1_n, max_stride_upper_bound + 1_n),
71+
[](nonnegative_int stride) { return stride_t{stride}; });
6672
std::unordered_multiset<std::vector<stride_t>> raw_stride_vectors =
67-
cartesian_product(replicate(tensor_dims.size(), single_stride_range));
73+
cartesian_product(
74+
repeat_element(/*num_times=*/num_elements(tensor_dims),
75+
/*element=*/single_stride_range));
6876
std::unordered_multiset<MultiDimensionalStride> strides =
6977
transform(raw_stride_vectors, [](auto const &stride_vec) {
7078
return MultiDimensionalStride{stride_vec};
@@ -75,8 +83,9 @@ static std::unordered_set<MachineView>
7583
auto candidate_starts = [](MachineSpecification const &ms,
7684
DeviceType const &device_type) {
7785
std::unordered_set<MachineSpaceCoordinate> result;
78-
for (int node_idx : range(ms.num_nodes)) {
79-
for (int device_idx : range(get_num_devices_per_node(ms, device_type))) {
86+
for (nonnegative_int node_idx : nonnegative_range(ms.num_nodes)) {
87+
for (nonnegative_int device_idx :
88+
nonnegative_range(get_num_devices_per_node(ms, device_type))) {
8089
result.insert(
8190
MachineSpaceCoordinate{node_idx, device_idx, device_type});
8291
}
@@ -91,8 +100,8 @@ static std::unordered_set<MachineView>
91100
return get_all_permutations_with_repetition(options, num_dims(task));
92101
};
93102

94-
std::vector<int> tensor_dims = task.degrees;
95-
int total_devices = get_num_devices(machine_spec, device_type);
103+
std::vector<nonnegative_int> tensor_dims = task.degrees;
104+
nonnegative_int total_devices = get_num_devices(machine_spec, device_type);
96105

97106
std::unordered_set<MachineView> machine_views;
98107

lib/compiler/src/compiler/machine_mapping/get_machine_resource_splits.cc

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,17 +11,19 @@ std::unordered_set<std::pair<MachineSpecification, MachineSpecification>>
1111
for (int i = 1; i < resource.num_nodes; i *= 2) {
1212
MachineSpecification sub_resource1 = resource;
1313
MachineSpecification sub_resource2 = resource;
14-
sub_resource1.num_nodes = i;
15-
sub_resource2.num_nodes = resource.num_nodes - i;
14+
sub_resource1.num_nodes = nonnegative_int{i};
15+
sub_resource2.num_nodes =
16+
nonnegative_int{resource.num_nodes.unwrap_nonnegative() - i};
1617
result.insert(std::make_pair(sub_resource1, sub_resource2));
1718
result.insert(std::make_pair(sub_resource2, sub_resource1));
1819
}
1920

2021
for (int i = 1; i < resource.num_gpus_per_node; i *= 2) {
2122
MachineSpecification sub_resource1 = resource;
2223
MachineSpecification sub_resource2 = resource;
23-
sub_resource1.num_gpus_per_node = i;
24-
sub_resource2.num_gpus_per_node = resource.num_gpus_per_node - i;
24+
sub_resource1.num_gpus_per_node = nonnegative_int{i};
25+
sub_resource2.num_gpus_per_node =
26+
nonnegative_int{resource.num_gpus_per_node.unwrap_nonnegative() - i};
2527
result.insert(std::make_pair(sub_resource1, sub_resource2));
2628
result.insert(std::make_pair(sub_resource2, sub_resource1));
2729
}

lib/compiler/src/compiler/machine_mapping/machine_mapping.cc

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,14 @@
11
#include "compiler/machine_mapping/machine_mapping.h"
2-
#include "pcg/machine_specification.h"
3-
#include "pcg/machine_view.h"
4-
#include "pcg/operator_task_space.dtg.h"
5-
#include "pcg/operator_task_space.h"
6-
#include "pcg/parallel_computation_graph/parallel_computation_graph.h"
72
#include "utils/containers/are_disjoint.h"
8-
#include "utils/containers/get_one_of.h"
93
#include "utils/containers/keys.h"
10-
#include "utils/containers/map_values.h"
114
#include "utils/containers/merge_maps.h"
125

136
namespace FlexFlow {
147

158
MachineMapping combine_disjoint_mappings(MachineMapping const &m1,
169
MachineMapping const &m2) {
17-
return MachineMapping{merge_maps(m1.machine_views, m2.machine_views)};
10+
return MachineMapping{
11+
merge_disjoint_maps(m1.machine_views, m2.machine_views)};
1812
}
1913

2014
bool nodes_are_disjoint(MachineMapping const &m1, MachineMapping const &m2) {

lib/compiler/src/compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@ ParallelLayerGuidObliviousMachineMapping binary_combine_mappings(
1010
ParallelLayerGuidObliviousMachineMapping const &lhs,
1111
ParallelLayerGuidObliviousMachineMapping const &rhs) {
1212
return ParallelLayerGuidObliviousMachineMapping{
13-
merge_maps(map_keys(lhs.raw_mapping, nest_inside_left_child),
14-
map_keys(rhs.raw_mapping, nest_inside_right_child)),
13+
merge_disjoint_maps(map_keys(lhs.raw_mapping, nest_inside_left_child),
14+
map_keys(rhs.raw_mapping, nest_inside_right_child)),
1515
};
1616
}
1717

lib/compiler/src/compiler/series_parallel/computation_graph/computation_graph_binary_sp_decomposition.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,7 @@ std::unordered_multiset<layer_guid_t>
164164

165165
V1BinarySPDecomposition
166166
to_v1(ComputationGraphBinarySPDecomposition const &tree,
167-
bidict<int, layer_guid_t> const &layer_numbering) {
167+
bidict<nonnegative_int, layer_guid_t> const &layer_numbering) {
168168
return tree.visit<V1BinarySPDecomposition>(
169169
overload{[&](ComputationGraphBinarySeriesSplit const &series) {
170170
return V1BinarySPDecomposition{

lib/compiler/test/src/allowed_machine_views.cc

Lines changed: 33 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -15,39 +15,39 @@ TEST_SUITE(FF_TEST_SUITE) {
1515

1616
SUBCASE("1 degree of parallelism") {
1717
MachineSpecification ms = MachineSpecification{
18-
/*num_nodes=*/1,
19-
/*num_cpus_per_node=*/5,
20-
/*num_gpus_per_node=*/5,
18+
/*num_nodes=*/1_n,
19+
/*num_cpus_per_node=*/5_n,
20+
/*num_gpus_per_node=*/5_n,
2121
/*inter_node_bandwidth=*/0,
2222
/*intra_node_bandwidth=*/0,
2323
};
2424

25-
OperatorTaskSpace task = OperatorTaskSpace{{3}};
25+
OperatorTaskSpace task = OperatorTaskSpace{{3_n}};
2626

2727
std::unordered_set<MachineView> correct = {
2828
MachineView{
2929
MachineSpaceCoordinate{
30-
/*node_idx=*/0, /*device_idx=*/0, DeviceType::GPU},
31-
{MachineViewDimension{stride_t{1},
30+
/*node_idx=*/0_n, /*device_idx=*/0_n, DeviceType::GPU},
31+
{MachineViewDimension{stride_t{1_n},
3232
MachineSpecificationDimension::INTRA_NODE}},
3333
},
3434

3535
MachineView{
3636
MachineSpaceCoordinate{
37-
/*node_idx=*/0, /*device_idx=*/1, DeviceType::GPU},
38-
{MachineViewDimension{stride_t{1},
37+
/*node_idx=*/0_n, /*device_idx=*/1_n, DeviceType::GPU},
38+
{MachineViewDimension{stride_t{1_n},
3939
MachineSpecificationDimension::INTRA_NODE}},
4040
},
4141
MachineView{
4242
MachineSpaceCoordinate{
43-
/*node_idx=*/0, /*device_idx=*/2, DeviceType::GPU},
44-
{MachineViewDimension{stride_t{1},
43+
/*node_idx=*/0_n, /*device_idx=*/2_n, DeviceType::GPU},
44+
{MachineViewDimension{stride_t{1_n},
4545
MachineSpecificationDimension::INTRA_NODE}},
4646
},
4747
MachineView{
4848
MachineSpaceCoordinate{
49-
/*node_idx=*/0, /*device_idx=*/0, DeviceType::GPU},
50-
{MachineViewDimension{stride_t{2},
49+
/*node_idx=*/0_n, /*device_idx=*/0_n, DeviceType::GPU},
50+
{MachineViewDimension{stride_t{2_n},
5151
MachineSpecificationDimension::INTRA_NODE}},
5252
},
5353
};
@@ -61,18 +61,18 @@ TEST_SUITE(FF_TEST_SUITE) {
6161
SUBCASE("2 degrees of parallelism") {
6262

6363
MachineSpecification ms = MachineSpecification{
64-
/*num_nodes=*/3,
65-
/*num_cpus_per_node=*/3,
66-
/*num_gpus_per_node=*/3,
64+
/*num_nodes=*/3_n,
65+
/*num_cpus_per_node=*/3_n,
66+
/*num_gpus_per_node=*/3_n,
6767
/*inter_node_bandwidth=*/0,
6868
/*intra_node_bandwidth=*/0,
6969
};
70-
OperatorTaskSpace task = OperatorTaskSpace{{2, 3}};
70+
OperatorTaskSpace task = OperatorTaskSpace{{2_n, 3_n}};
7171

72-
auto make_2d_view = [&](int start_node_idx,
73-
int start_device_idx,
74-
int stride1,
75-
int stride2,
72+
auto make_2d_view = [&](nonnegative_int start_node_idx,
73+
nonnegative_int start_device_idx,
74+
nonnegative_int stride1,
75+
nonnegative_int stride2,
7676
MachineSpecificationDimension m1,
7777
MachineSpecificationDimension m2) {
7878
return MachineView{
@@ -86,13 +86,19 @@ TEST_SUITE(FF_TEST_SUITE) {
8686
auto intra = MachineSpecificationDimension::INTRA_NODE;
8787
auto inter = MachineSpecificationDimension::INTER_NODE;
8888
std::unordered_set<MachineView> correct = {
89-
make_2d_view(0, 0, /*stride1=*/1, /*stride2=*/1, inter, intra),
90-
make_2d_view(1, 0, /*stride1=*/1, /*stride2=*/1, inter, intra),
91-
make_2d_view(0, 0, /*stride1=*/2, /*stride2=*/1, inter, intra),
92-
93-
make_2d_view(0, 0, /*stride1=*/1, /*stride2=*/1, intra, inter),
94-
make_2d_view(0, 1, /*stride1=*/1, /*stride2=*/1, intra, inter),
95-
make_2d_view(0, 0, /*stride1=*/2, /*stride2=*/1, intra, inter),
89+
make_2d_view(
90+
0_n, 0_n, /*stride1=*/1_n, /*stride2=*/1_n, inter, intra),
91+
make_2d_view(
92+
1_n, 0_n, /*stride1=*/1_n, /*stride2=*/1_n, inter, intra),
93+
make_2d_view(
94+
0_n, 0_n, /*stride1=*/2_n, /*stride2=*/1_n, inter, intra),
95+
96+
make_2d_view(
97+
0_n, 0_n, /*stride1=*/1_n, /*stride2=*/1_n, intra, inter),
98+
make_2d_view(
99+
0_n, 1_n, /*stride1=*/1_n, /*stride2=*/1_n, intra, inter),
100+
make_2d_view(
101+
0_n, 0_n, /*stride1=*/2_n, /*stride2=*/1_n, intra, inter),
96102
};
97103

98104
std::unordered_set<MachineView> result =

0 commit comments

Comments
 (0)