@@ -115,6 +115,7 @@ Compilation::Compilation(DiagnosticEngine &Diags,
115
115
bool EnableBatchMode,
116
116
unsigned BatchSeed,
117
117
Optional<unsigned > BatchCount,
118
+ Optional<unsigned > BatchSizeLimit,
118
119
bool ForceOneBatchRepartition,
119
120
bool SkipTaskExecution,
120
121
bool SaveTemps,
@@ -138,6 +139,7 @@ Compilation::Compilation(DiagnosticEngine &Diags,
138
139
EnableBatchMode(EnableBatchMode),
139
140
BatchSeed(BatchSeed),
140
141
BatchCount(BatchCount),
142
+ BatchSizeLimit(BatchSizeLimit),
141
143
ForceOneBatchRepartition(ForceOneBatchRepartition),
142
144
SaveTemps(SaveTemps),
143
145
ShowDriverTimeCompilation(ShowDriverTimeCompilation),
@@ -920,6 +922,117 @@ namespace driver {
920
922
return false ;
921
923
}
922
924
925
+ // Selects the number of partitions based on the user-provided batch
926
+ // count and/or the number of parallel tasks we can run, subject to a
927
+ // fixed per-batch safety cap, to avoid overcommitting memory.
928
+ size_t pickNumberOfPartitions () {
929
+
930
+ // If the user asked for something, use that.
931
+ if (Comp.BatchCount .hasValue ())
932
+ return Comp.BatchCount .getValue ();
933
+
934
+ // This is a long comment to justify a simple calculation.
935
+ //
936
+ // Because there is a secondary "outer" build system potentially also
937
+ // scheduling multiple drivers in parallel on separate build targets
938
+ // -- while we, the driver, schedule our own subprocesses -- we might
939
+ // be creating up to $NCPU^2 worth of _memory pressure_.
940
+ //
941
+ // Oversubscribing CPU is typically no problem these days, but
942
+ // oversubscribing memory can lead to paging, which on modern systems
943
+ // is quite bad.
944
+ //
945
+ // In practice, $NCPU^2 processes doesn't _quite_ happen: as core
946
+ // count rises, it usually exceeds the number of large targets
947
+ // without any dependencies between them (which are the only thing we
948
+ // have to worry about): you might have (say) 2 large independent
949
+ // modules * 2 architectures, but that's only an $NTARGET value of 4,
950
+ // which is much less than $NCPU if you're on a 24 or 36-way machine.
951
+ //
952
+ // So the actual number of concurrent processes is:
953
+ //
954
+ // NCONCUR := $NCPU * min($NCPU, $NTARGET)
955
+ //
956
+ // Empirically, a frontend uses about 512kb RAM per non-primary file
957
+ // and about 10mb per primary. The number of non-primaries per
958
+ // process is a constant in a given module, but the number of
959
+ // primaries -- the "batch size" -- is inversely proportional to the
960
+ // batch count (default: $NCPU). As a result, the memory pressure
961
+ // we can expect is:
962
+ //
963
+ // $NCONCUR * (($NONPRIMARYMEM * $NFILE) +
964
+ // ($PRIMARYMEM * ($NFILE/$NCPU)))
965
+ //
966
+ // If we tabulate this across some plausible values, we see
967
+ // unfortunate memory-pressure results:
968
+ //
969
+ // $NFILE
970
+ // +---------------------
971
+ // $NTARGET $NCPU | 100 500 1000
972
+ // ----------------+---------------------
973
+ // 2 2 | 2gb 11gb 22gb
974
+ // 4 4 | 4gb 24gb 48gb
975
+ // 4 8 | 5gb 28gb 56gb
976
+ // 4 16 | 7gb 36gb 72gb
977
+ // 4 36 | 11gb 56gb 112gb
978
+ //
979
+ // As it happens, the lower parts of the table are dominated by
980
+ // number of processes rather than the files-per-batch (the batches
981
+ // are already quite small due to the high core count) and the left
982
+ // side of the table is dealing with modules too small to worry
983
+ // about. But the middle and upper-right quadrant is problematic: 4
984
+ // and 8 core machines do not typically have 24-48gb of RAM, it'd be
985
+ // nice not to page on them when building a 4-target project with
986
+ // 500-file modules.
987
+ //
988
+ // Turns we can do that if we just cap the batch size statically at,
989
+ // say, 25 files per batch, we get a better formula:
990
+ //
991
+ // $NCONCUR * (($NONPRIMARYMEM * $NFILE) +
992
+ // ($PRIMARYMEM * min(25, ($NFILE/$NCPU))))
993
+ //
994
+ // $NFILE
995
+ // +---------------------
996
+ // $NTARGET $NCPU | 100 500 1000
997
+ // ----------------+---------------------
998
+ // 2 2 | 1gb 2gb 3gb
999
+ // 4 4 | 4gb 8gb 12gb
1000
+ // 4 8 | 5gb 16gb 24gb
1001
+ // 4 16 | 7gb 32gb 48gb
1002
+ // 4 36 | 11gb 56gb 108gb
1003
+ //
1004
+ // This means that the "performance win" of batch mode diminishes
1005
+ // slightly: the batching factor in the equation drops from
1006
+ // ($NFILE/$NCPU) to min(25, $NFILE/$NCPU). In practice this seems to
1007
+ // not cost too much: the additional factor in number of subprocesses
1008
+ // run is the following:
1009
+ //
1010
+ // $NFILE
1011
+ // +---------------------
1012
+ // $NTARGET $NCPU | 100 500 1000
1013
+ // ----------------+---------------------
1014
+ // 2 2 | 2x 10x 20x
1015
+ // 4 4 | - 5x 10x
1016
+ // 4 8 | - 2.5x 5x
1017
+ // 4 16 | - 1.25x 2.5x
1018
+ // 4 36 | - - 1.1x
1019
+ //
1020
+ // Where - means "no difference" because the batches were already
1021
+ // smaller than 25.
1022
+ //
1023
+ // Even in the worst case here, the 1000-file module on 2-core
1024
+ // machine is being built with only 40 subprocesses, rather than the
1025
+ // pre-batch-mode 1000. I.e. it's still running 96% fewer
1026
+ // subprocesses than before. And significantly: it's doing so while
1027
+ // not exceeding the RAM of a typical 2-core laptop.
1028
+
1029
+ size_t DefaultSizeLimit = 25 ;
1030
+ size_t NumTasks = TQ->getNumberOfParallelTasks ();
1031
+ size_t NumFiles = PendingExecution.size ();
1032
+ size_t SizeLimit = Comp.BatchSizeLimit .getValueOr (DefaultSizeLimit);
1033
+ return std::max (NumTasks, NumFiles / SizeLimit);
1034
+ }
1035
+
923
1036
/// Select jobs that are batch-combinable from \c PendingExecution, combine
924
1037
/// them together into \p BatchJob instances (also inserted into \p
925
1038
/// BatchJobs), and enqueue all \c PendingExecution jobs (whether batched or
@@ -933,9 +1046,7 @@ namespace driver {
933
1046
return ;
934
1047
}
935
1048
936
- size_t NumPartitions = (Comp.BatchCount .hasValue () ?
937
- Comp.BatchCount .getValue () :
938
- TQ->getNumberOfParallelTasks ());
1049
+ size_t NumPartitions = pickNumberOfPartitions ();
939
1050
CommandSetVector Batchable, NonBatchable;
940
1051
std::vector<const Job *> Batches;
941
1052
bool PretendTheCommandLineIsTooLongOnce =
0 commit comments