Skip to content

Commit d38e1f0

Browse files
committed
[BatchMode] <rdar://41271283> Limit memory pressure on large modules.
1 parent 5696fb5 commit d38e1f0

File tree

6 files changed

+180
-7
lines changed

6 files changed

+180
-7
lines changed

include/swift/Driver/Compilation.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,9 +181,14 @@ class Compilation {
181181
/// Provides a randomization seed to batch-mode partitioning, for debugging.
182182
const unsigned BatchSeed;
183183

184-
/// Overrides parallelism level as count of batches, if in batch-mode.
184+
/// Overrides parallelism level and \c BatchSizeLimit, sets exact
185+
/// count of batches, if in batch-mode.
185186
const Optional<unsigned> BatchCount;
186187

188+
/// Overrides maximum batch size, if in batch-mode and not overridden
189+
/// by \c BatchCount.
190+
const Optional<unsigned> BatchSizeLimit;
191+
187192
/// In order to test repartitioning, set to true if
188193
/// -driver-force-one-batch-repartition is present.
189194
const bool ForceOneBatchRepartition = false;
@@ -240,6 +245,7 @@ class Compilation {
240245
bool EnableBatchMode = false,
241246
unsigned BatchSeed = 0,
242247
Optional<unsigned> BatchCount = None,
248+
Optional<unsigned> BatchSizeLimit = None,
243249
bool ForceOneBatchRepartition = false,
244250
bool SkipTaskExecution = false,
245251
bool SaveTemps = false,

include/swift/Option/Options.td

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,10 @@ def driver_batch_seed : Separate<["-"], "driver-batch-seed">,
112112
HelpText<"Use the given seed value to randomize batch-mode partitions">;
113113
def driver_batch_count : Separate<["-"], "driver-batch-count">,
114114
InternalDebugOpt,
115-
HelpText<"Use the given number of batch-mode partitions, rather than default parallelism level">;
115+
HelpText<"Use the given number of batch-mode partitions, rather than partitioning dynamically">;
116+
def driver_batch_size_limit : Separate<["-"], "driver-batch-size-limit">,
117+
InternalDebugOpt,
118+
HelpText<"Use the given number as the upper limit on dynamic batch-mode partition size">;
116119
def driver_force_one_batch_repartition : Flag<["-"], "driver-force-one-batch-repartition">,
117120
InternalDebugOpt,
118121
HelpText<"Force one batch repartitioning for testing">;

lib/Driver/Compilation.cpp

Lines changed: 114 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ Compilation::Compilation(DiagnosticEngine &Diags,
115115
bool EnableBatchMode,
116116
unsigned BatchSeed,
117117
Optional<unsigned> BatchCount,
118+
Optional<unsigned> BatchSizeLimit,
118119
bool ForceOneBatchRepartition,
119120
bool SkipTaskExecution,
120121
bool SaveTemps,
@@ -138,6 +139,7 @@ Compilation::Compilation(DiagnosticEngine &Diags,
138139
EnableBatchMode(EnableBatchMode),
139140
BatchSeed(BatchSeed),
140141
BatchCount(BatchCount),
142+
BatchSizeLimit(BatchSizeLimit),
141143
ForceOneBatchRepartition(ForceOneBatchRepartition),
142144
SaveTemps(SaveTemps),
143145
ShowDriverTimeCompilation(ShowDriverTimeCompilation),
@@ -920,6 +922,117 @@ namespace driver {
920922
return false;
921923
}
922924

925+
// Selects the number of partitions based on the user-provided batch
926+
// count and/or the number of parallel tasks we can run, subject to a
927+
// fixed per-batch safety cap, to avoid overcommitting memory.
928+
size_t pickNumberOfPartitions() {
929+
930+
// If the user asked for something, use that.
931+
if (Comp.BatchCount.hasValue())
932+
return Comp.BatchCount.getValue();
933+
934+
// This is a long comment to justify a simple calculation.
935+
//
936+
// Because there is a secondary "outer" build system potentially also
937+
// scheduling multiple drivers in parallel on separate build targets
938+
// -- while we, the driver, schedule our own subprocesses -- we might
939+
// be creating up to $NCPU^2 worth of _memory pressure_.
940+
//
941+
// Oversubscribing CPU is typically no problem these days, but
942+
// oversubscribing memory can lead to paging, which on modern systems
943+
// is quite bad.
944+
//
945+
// In practice, $NCPU^2 processes doesn't _quite_ happen: as core
946+
// count rises, it usually exceeds the number of large targets
947+
// without any dependencies between them (which are the only thing we
948+
// have to worry about): you might have (say) 2 large independent
949+
// modules * 2 architectures, but that's only an $NTARGET value of 4,
950+
// which is much less than $NCPU if you're on a 24 or 36-way machine.
951+
//
952+
// So the actual number of concurrent processes is:
953+
//
954+
// NCONCUR := $NCPU * min($NCPU, $NTARGET)
955+
//
956+
// Empirically, a frontend uses about 512kb RAM per non-primary file
957+
// and about 10mb per primary. The number of non-primaries per
958+
// process is a constant in a given module, but the number of
959+
// primaries -- the "batch size" -- is inversely proportional to the
960+
// batch count (default: $NCPU). As a result, the memory pressure
961+
// we can expect is:
962+
//
963+
// $NCONCUR * (($NONPRIMARYMEM * $NFILE) +
964+
// ($PRIMARYMEM * ($NFILE/$NCPU)))
965+
//
966+
// If we tabulate this across some plausible values, we see
967+
// unfortunate memory-pressure results:
968+
//
969+
// $NFILE
970+
// +---------------------
971+
// $NTARGET $NCPU | 100 500 1000
972+
// ----------------+---------------------
973+
// 2 2 | 2gb 11gb 22gb
974+
// 4 4 | 4gb 24gb 48gb
975+
// 4 8 | 5gb 28gb 56gb
976+
// 4 16 | 7gb 36gb 72gb
977+
// 4 36 | 11gb 56gb 112gb
978+
//
979+
// As it happens, the lower parts of the table are dominated by
980+
// number of processes rather than the files-per-batch (the batches
981+
// are already quite small due to the high core count) and the left
982+
// side of the table is dealing with modules too small to worry
983+
// about. But the middle and upper-right quadrant is problematic: 4
984+
// and 8 core machines do not typically have 24-48gb of RAM, it'd be
985+
// nice not to page on them when building a 4-target project with
986+
// 500-file modules.
987+
//
988+
// Turns we can do that if we just cap the batch size statically at,
989+
// say, 25 files per batch, we get a better formula:
990+
//
991+
// $NCONCUR * (($NONPRIMARYMEM * $NFILE) +
992+
// ($PRIMARYMEM * min(25, ($NFILE/$NCPU))))
993+
//
994+
// $NFILE
995+
// +---------------------
996+
// $NTARGET $NCPU | 100 500 1000
997+
// ----------------+---------------------
998+
// 2 2 | 1gb 2gb 3gb
999+
// 4 4 | 4gb 8gb 12gb
1000+
// 4 8 | 5gb 16gb 24gb
1001+
// 4 16 | 7gb 32gb 48gb
1002+
// 4 36 | 11gb 56gb 108gb
1003+
//
1004+
// This means that the "performance win" of batch mode diminishes
1005+
// slightly: the batching factor in the equation drops from
1006+
// ($NFILE/$NCPU) to min(25, $NFILE/$NCPU). In practice this seems to
1007+
// not cost too much: the additional factor in number of subprocesses
1008+
// run is the following:
1009+
//
1010+
// $NFILE
1011+
// +---------------------
1012+
// $NTARGET $NCPU | 100 500 1000
1013+
// ----------------+---------------------
1014+
// 2 2 | 2x 10x 20x
1015+
// 4 4 | - 5x 10x
1016+
// 4 8 | - 2.5x 5x
1017+
// 4 16 | - 1.25x 2.5x
1018+
// 4 36 | - - 1.1x
1019+
//
1020+
// Where - means "no difference" because the batches were already
1021+
// smaller than 25.
1022+
//
1023+
// Even in the worst case here, the 1000-file module on 2-core
1024+
// machine is being built with only 40 subprocesses, rather than the
1025+
// pre-batch-mode 1000. I.e. it's still running 96% fewer
1026+
// subprocesses than before. And significantly: it's doing so while
1027+
// not exceeding the RAM of a typical 2-core laptop.
1028+
1029+
size_t DefaultSizeLimit = 25;
1030+
size_t NumTasks = TQ->getNumberOfParallelTasks();
1031+
size_t NumFiles = PendingExecution.size();
1032+
size_t SizeLimit = Comp.BatchSizeLimit.getValueOr(DefaultSizeLimit);
1033+
return std::max(NumTasks, NumFiles / SizeLimit);
1034+
}
1035+
9231036
/// Select jobs that are batch-combinable from \c PendingExecution, combine
9241037
/// them together into \p BatchJob instances (also inserted into \p
9251038
/// BatchJobs), and enqueue all \c PendingExecution jobs (whether batched or
@@ -933,9 +1046,7 @@ namespace driver {
9331046
return;
9341047
}
9351048

936-
size_t NumPartitions = (Comp.BatchCount.hasValue() ?
937-
Comp.BatchCount.getValue() :
938-
TQ->getNumberOfParallelTasks());
1049+
size_t NumPartitions = pickNumberOfPartitions();
9391050
CommandSetVector Batchable, NonBatchable;
9401051
std::vector<const Job *> Batches;
9411052
bool PretendTheCommandLineIsTooLongOnce =

lib/Driver/Driver.cpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -646,6 +646,22 @@ getDriverBatchCount(llvm::opt::InputArgList &ArgList,
646646
return None;
647647
}
648648

649+
/// Reads the value of the last \c -driver-batch-size-limit argument, if any.
///
/// \param ArgList the parsed driver arguments to search.
/// \param Diags receives \c error_invalid_arg_value when the argument's
///        value is not a base-10 unsigned integer, or is zero.
/// \returns the parsed limit, or \c None when the argument is absent or
///          its value was rejected.
static Optional<unsigned>
getDriverBatchSizeLimit(llvm::opt::InputArgList &ArgList,
                        DiagnosticEngine &Diags)
{
  const Arg *A = ArgList.getLastArg(options::OPT_driver_batch_size_limit);
  if (!A)
    return None;

  unsigned Limit = 0;
  // getAsInteger() returns true on failure. A limit of zero is rejected
  // as well: a batch cannot hold zero files, and the limit is used as a
  // divisor when the number of batches is computed.
  if (StringRef(A->getValue()).getAsInteger(10, Limit) || Limit == 0) {
    Diags.diagnose(SourceLoc(), diag::error_invalid_arg_value,
                   A->getAsString(ArgList), A->getValue());
    return None;
  }
  return Limit;
}
664+
649665
std::unique_ptr<Compilation>
650666
Driver::buildCompilation(const ToolChain &TC,
651667
std::unique_ptr<llvm::opt::InputArgList> ArgList) {
@@ -671,6 +687,8 @@ Driver::buildCompilation(const ToolChain &TC,
671687
ArgList->hasArg(options::OPT_driver_show_job_lifecycle);
672688
unsigned DriverBatchSeed = getDriverBatchSeed(*ArgList, Diags);
673689
Optional<unsigned> DriverBatchCount = getDriverBatchCount(*ArgList, Diags);
690+
Optional<unsigned> DriverBatchSizeLimit =
691+
getDriverBatchSizeLimit(*ArgList, Diags);
674692
bool DriverForceOneBatchRepartition =
675693
ArgList->hasArg(options::OPT_driver_force_one_batch_repartition);
676694

@@ -850,6 +868,7 @@ Driver::buildCompilation(const ToolChain &TC,
850868
BatchMode,
851869
DriverBatchSeed,
852870
DriverBatchCount,
871+
DriverBatchSizeLimit,
853872
DriverForceOneBatchRepartition,
854873
DriverSkipExecution,
855874
SaveTemps,

test/Driver/batch_mode_overlong_argv.swift

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@
103103
// RUN: touch %t/f_100_1.swift %t/f_100_2.swift %t/f_100_3.swift %t/f_100_4.swift %t/f_100_5.swift %t/f_100_6.swift %t/f_100_7.swift %t/f_100_8.swift %t/f_100_9.swift %t/f_100_10.swift
104104
// RUN: mkdir -p %t/additional/path/elements/often/make/filenames/longer/than/one/might/expect/especially/given/output/directories/deep/within/a/derived/data/folder/of/a/CI/machine/
105105
// Force the repartitioning:
106-
// RUN: %swiftc_driver -driver-show-job-lifecycle -driver-force-one-batch-repartition -v -c -module-name foo -o %t/additional/path/elements/often/make/filenames/longer/than/one/might/expect/especially/given/output/directories/deep/within/a/derived/data/folder/of/a/CI/machine/foo.o -emit-module -serialize-diagnostics -emit-dependencies -j 1 -enable-batch-mode %t/f_*.swift >%t/out.txt 2>&1
106+
// RUN: %swiftc_driver -driver-show-job-lifecycle -driver-batch-size-limit 10000 -driver-force-one-batch-repartition -v -c -module-name foo -o %t/additional/path/elements/often/make/filenames/longer/than/one/might/expect/especially/given/output/directories/deep/within/a/derived/data/folder/of/a/CI/machine/foo.o -emit-module -serialize-diagnostics -emit-dependencies -j 1 -enable-batch-mode %t/f_*.swift >%t/out.txt 2>&1
107107
// RUN: %FileCheck %s <%t/out.txt
108108
// CHECK-NOT: unable to execute command
109109
// CHECK: Forming into 1 batches
@@ -113,7 +113,7 @@
113113
// CHECK: Forming batch job from 500 constituents
114114
//
115115
// Try it without the force; supplementary output file maps should obviate the repartition:
116-
// RUN: %swiftc_driver -driver-show-job-lifecycle -v -c -module-name foo -o %t/additional/path/elements/often/make/filenames/longer/than/one/might/expect/especially/given/output/directories/deep/within/a/derived/data/folder/of/a/CI/machine/foo.o -emit-module -serialize-diagnostics -emit-dependencies -j 1 -enable-batch-mode %t/f_*.swift >%t/out2.txt 2>&1
116+
// RUN: %swiftc_driver -driver-show-job-lifecycle -driver-batch-size-limit 10000 -v -c -module-name foo -o %t/additional/path/elements/often/make/filenames/longer/than/one/might/expect/especially/given/output/directories/deep/within/a/derived/data/folder/of/a/CI/machine/foo.o -emit-module -serialize-diagnostics -emit-dependencies -j 1 -enable-batch-mode %t/f_*.swift >%t/out2.txt 2>&1
117117
// RUN: %FileCheck %s <%t/out2.txt -check-prefix=NO-REPARTITION
118118
// CHECK-NOT: unable to execute command
119119
// NO-REPARTITION: Forming into 1 batches
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
// RUN: %empty-directory(%t)
2+
// RUN: touch %t/f_1_1.swift %t/f_1_2.swift %t/f_1_3.swift %t/f_1_4.swift %t/f_1_5.swift %t/f_1_6.swift %t/f_1_7.swift %t/f_1_8.swift %t/f_1_9.swift %t/f_1_10.swift
3+
// RUN: touch %t/f_2_1.swift %t/f_2_2.swift %t/f_2_3.swift %t/f_2_4.swift %t/f_2_5.swift %t/f_2_6.swift %t/f_2_7.swift %t/f_2_8.swift %t/f_2_9.swift %t/f_2_10.swift
4+
// RUN: touch %t/f_3_1.swift %t/f_3_2.swift %t/f_3_3.swift %t/f_3_4.swift %t/f_3_5.swift %t/f_3_6.swift %t/f_3_7.swift %t/f_3_8.swift %t/f_3_9.swift %t/f_3_10.swift
5+
// RUN: touch %t/f_4_1.swift %t/f_4_2.swift %t/f_4_3.swift %t/f_4_4.swift %t/f_4_5.swift %t/f_4_6.swift %t/f_4_7.swift %t/f_4_8.swift %t/f_4_9.swift %t/f_4_10.swift
6+
// RUN: touch %t/f_5_1.swift %t/f_5_2.swift %t/f_5_3.swift %t/f_5_4.swift %t/f_5_5.swift %t/f_5_6.swift %t/f_5_7.swift %t/f_5_8.swift %t/f_5_9.swift %t/f_5_10.swift
7+
// RUN: touch %t/f_6_1.swift %t/f_6_2.swift %t/f_6_3.swift %t/f_6_4.swift %t/f_6_5.swift %t/f_6_6.swift %t/f_6_7.swift %t/f_6_8.swift %t/f_6_9.swift %t/f_6_10.swift
8+
// RUN: touch %t/f_7_1.swift %t/f_7_2.swift %t/f_7_3.swift %t/f_7_4.swift %t/f_7_5.swift %t/f_7_6.swift %t/f_7_7.swift %t/f_7_8.swift %t/f_7_9.swift %t/f_7_10.swift
9+
// RUN: touch %t/f_8_1.swift %t/f_8_2.swift %t/f_8_3.swift %t/f_8_4.swift %t/f_8_5.swift %t/f_8_6.swift %t/f_8_7.swift %t/f_8_8.swift %t/f_8_9.swift %t/f_8_10.swift
10+
// RUN: touch %t/f_9_1.swift %t/f_9_2.swift %t/f_9_3.swift %t/f_9_4.swift %t/f_9_5.swift %t/f_9_6.swift %t/f_9_7.swift %t/f_9_8.swift %t/f_9_9.swift %t/f_9_10.swift
11+
// RUN: touch %t/f_10_1.swift %t/f_10_2.swift %t/f_10_3.swift %t/f_10_4.swift %t/f_10_5.swift %t/f_10_6.swift %t/f_10_7.swift %t/f_10_8.swift %t/f_10_9.swift %t/f_10_10.swift
12+
// RUN: %swiftc_driver -driver-show-job-lifecycle -v -c -module-name foo -emit-module -serialize-diagnostics -emit-dependencies -j 1 -enable-batch-mode %t/f_*.swift >%t/out.txt 2>&1
13+
// RUN: %FileCheck %s <%t/out.txt
14+
// CHECK-NOT: unable to execute command
15+
// CHECK: Forming into 4 batches
16+
// CHECK: Forming batch job from 25 constituents
17+
// CHECK: Forming batch job from 25 constituents
18+
// CHECK: Forming batch job from 25 constituents
19+
// CHECK: Forming batch job from 25 constituents
20+
//
21+
// RUN: %swiftc_driver -driver-show-job-lifecycle -driver-batch-size-limit 10 -v -c -module-name foo -emit-module -serialize-diagnostics -emit-dependencies -j 1 -enable-batch-mode %t/f_*.swift >%t/out.txt 2>&1
22+
// RUN: %FileCheck %s <%t/out.txt -check-prefix=EXPLICIT-ARG
23+
// EXPLICIT-ARG-NOT: unable to execute command
24+
// EXPLICIT-ARG: Forming into 10 batches
25+
// EXPLICIT-ARG: Forming batch job from 10 constituents
26+
// EXPLICIT-ARG: Forming batch job from 10 constituents
27+
// EXPLICIT-ARG: Forming batch job from 10 constituents
28+
// EXPLICIT-ARG: Forming batch job from 10 constituents
29+
// EXPLICIT-ARG: Forming batch job from 10 constituents
30+
// EXPLICIT-ARG: Forming batch job from 10 constituents
31+
// EXPLICIT-ARG: Forming batch job from 10 constituents
32+
// EXPLICIT-ARG: Forming batch job from 10 constituents
33+
// EXPLICIT-ARG: Forming batch job from 10 constituents
34+
// EXPLICIT-ARG: Forming batch job from 10 constituents

0 commit comments

Comments
 (0)