Merge pull request #18377 from graydon/rdar-41271283-batch-mode-limit-memory-pressure-on-large-modules-swift-4.2-branch

graydon · web-flow · commit dc70c694d7c6 · 2018-07-31T08:46:41.000-07:00
[4.2][BatchMode] <rdar://41271283> Limit memory pressure on large modules.
diff --git a/include/swift/Driver/Compilation.h b/include/swift/Driver/Compilation.h
@@ -181,9 +181,14 @@ class Compilation {
   /// Provides a randomization seed to batch-mode partitioning, for debugging.
   const unsigned BatchSeed;
 
-  /// Overrides parallelism level as count of batches, if in batch-mode.
+  /// Overrides parallelism level and \c BatchSizeLimit, sets exact
+  /// count of batches, if in batch-mode.
   const Optional<unsigned> BatchCount;
 
+  /// Overrides maximum batch size, if in batch-mode and not overridden
+  /// by \c BatchCount.
+  const Optional<unsigned> BatchSizeLimit;
+
   /// In order to test repartitioning, set to true if
   /// -driver-force-one-batch-repartition is present.
   const bool ForceOneBatchRepartition = false;
@@ -240,6 +245,7 @@ class Compilation {
               bool EnableBatchMode = false,
               unsigned BatchSeed = 0,
               Optional<unsigned> BatchCount = None,
+              Optional<unsigned> BatchSizeLimit = None,
               bool ForceOneBatchRepartition = false,
               bool SkipTaskExecution = false,
               bool SaveTemps = false,
diff --git a/include/swift/Option/Options.td b/include/swift/Option/Options.td
@@ -112,7 +112,10 @@ def driver_batch_seed : Separate<["-"], "driver-batch-seed">,
   HelpText<"Use the given seed value to randomize batch-mode partitions">;
 def driver_batch_count : Separate<["-"], "driver-batch-count">,
   InternalDebugOpt,
-  HelpText<"Use the given number of batch-mode partitions, rather than default parallelism level">;
+  HelpText<"Use the given number of batch-mode partitions, rather than partitioning dynamically">;
+def driver_batch_size_limit : Separate<["-"], "driver-batch-size-limit">,
+  InternalDebugOpt,
+  HelpText<"Use the given number as the upper limit on dynamic batch-mode partition size">; // Has no effect when -driver-batch-count is also given.
 def driver_force_one_batch_repartition : Flag<["-"], "driver-force-one-batch-repartition">,
   InternalDebugOpt,
   HelpText<"Force one batch repartitioning for testing">;
diff --git a/lib/Driver/Compilation.cpp b/lib/Driver/Compilation.cpp
@@ -115,6 +115,7 @@ Compilation::Compilation(DiagnosticEngine &Diags,
                          bool EnableBatchMode,
                          unsigned BatchSeed,
                          Optional<unsigned> BatchCount,
+                         Optional<unsigned> BatchSizeLimit,
                          bool ForceOneBatchRepartition,
                          bool SkipTaskExecution,
                          bool SaveTemps,
@@ -138,6 +139,7 @@ Compilation::Compilation(DiagnosticEngine &Diags,
     EnableBatchMode(EnableBatchMode),
     BatchSeed(BatchSeed),
     BatchCount(BatchCount),
+    BatchSizeLimit(BatchSizeLimit),
     ForceOneBatchRepartition(ForceOneBatchRepartition),
     SaveTemps(SaveTemps),
     ShowDriverTimeCompilation(ShowDriverTimeCompilation),
@@ -920,6 +922,117 @@ namespace driver {
       return false;
     }
 
+    /// Selects the number of batch partitions: the user-provided batch count
+    /// if there is one, otherwise enough partitions to occupy every parallel
+    /// task while keeping each batch within a size cap, to limit memory use.
+    size_t pickNumberOfPartitions() {
+      // If the user asked for something, use that.
+      if (Comp.BatchCount.hasValue())
+        return Comp.BatchCount.getValue();
+
+      // This is a long comment to justify a simple calculation.
+      //
+      // Because there is a secondary "outer" build system potentially also
+      // scheduling multiple drivers in parallel on separate build targets
+      // -- while we, the driver, schedule our own subprocesses -- we might
+      // be creating up to $NCPU^2 worth of _memory pressure_.
+      //
+      // Oversubscribing CPU is typically no problem these days, but
+      // oversubscribing memory can lead to paging, which on modern systems
+      // is quite bad.
+      //
+      // In practice, $NCPU^2 processes doesn't _quite_ happen: as core
+      // count rises, it usually exceeds the number of large targets
+      // without any dependencies between them (which are the only thing we
+      // have to worry about): you might have (say) 2 large independent
+      // modules * 2 architectures, but that's only an $NTARGET value of 4,
+      // which is much less than $NCPU if you're on a 24 or 36-way machine.
+      //
+      //  So the actual number of concurrent processes is:
+      //
+      //     NCONCUR := $NCPU * min($NCPU, $NTARGET)
+      //
+      // Empirically, a frontend uses about 512kb RAM per non-primary file
+      // and about 10mb per primary. The number of non-primaries per
+      // process is a constant in a given module, but the number of
+      // primaries -- the "batch size" -- is inversely proportional to the
+      // batch count (default: $NCPU). As a result, the memory pressure
+      // we can expect is:
+      //
+      //  $NCONCUR * (($NONPRIMARYMEM * $NFILE) +
+      //              ($PRIMARYMEM * ($NFILE/$NCPU)))
+      //
+      // If we tabulate this across some plausible values, we see
+      // unfortunate memory-pressure results:
+      //
+      //                          $NFILE
+      //                  +---------------------
+      //  $NTARGET $NCPU  |  100    500    1000
+      //  ----------------+---------------------
+      //     2        2   |  2gb   11gb    22gb
+      //     4        4   |  4gb   24gb    48gb
+      //     4        8   |  5gb   28gb    56gb
+      //     4       16   |  7gb   36gb    72gb
+      //     4       36   | 11gb   56gb   112gb
+      //
+      // As it happens, the lower parts of the table are dominated by
+      // number of processes rather than the files-per-batch (the batches
+      // are already quite small due to the high core count) and the left
+      // side of the table is dealing with modules too small to worry
+      // about. But the middle and upper-right quadrant is problematic: 4
+      // and 8 core machines do not typically have 24-48gb of RAM, it'd be
+      // nice not to page on them when building a 4-target project with
+      // 500-file modules.
+      //
+      // Turns out that if we just cap the batch size statically at, say, 25
+      // files per batch, we get a better formula:
+      //
+      //  $NCONCUR * (($NONPRIMARYMEM * $NFILE) +
+      //              ($PRIMARYMEM * min(25, ($NFILE/$NCPU))))
+      //
+      //                          $NFILE
+      //                  +---------------------
+      //  $NTARGET $NCPU  |  100    500    1000
+      //  ----------------+---------------------
+      //     2        2   |  1gb    2gb     3gb
+      //     4        4   |  4gb    8gb    12gb
+      //     4        8   |  5gb   16gb    24gb
+      //     4       16   |  7gb   32gb    48gb
+      //     4       36   | 11gb   56gb   108gb
+      //
+      // This means that the "performance win" of batch mode diminishes
+      // slightly: the batching factor in the equation drops from ($NFILE/$NCPU)
+      // to min(25, $NFILE/$NCPU). In practice this seems to not cost too much:
+      // the additional factor in number of subprocesses run is the following:
+      //
+      //                          $NFILE
+      //                  +---------------------
+      //  $NTARGET $NCPU  |  100    500    1000
+      //  ----------------+---------------------
+      //     2        2   |  2x    10x      20x
+      //     4        4   |   -     5x      10x
+      //     4        8   |   -   2.5x       5x
+      //     4       16   |   -  1.25x     2.5x
+      //     4       36   |   -      -     1.1x
+      //
+      // Where - means "no difference": batches were already smaller than 25.
+      //
+      // Even in the worst case here, the 1000-file module on 2-core
+      // machine is being built with only 40 subprocesses, rather than the
+      // pre-batch-mode 1000. I.e. it's still running 96% fewer
+      // subprocesses than before. And significantly: it's doing so while
+      // not exceeding the RAM of a typical 2-core laptop.
+
+      size_t DefaultSizeLimit = 25;
+      size_t NumTasks = TQ->getNumberOfParallelTasks();
+      size_t NumFiles = PendingExecution.size();
+      // A zero limit would divide by zero; flooring would overfill batches.
+      size_t SizeLimit =
+          std::max<size_t>(1, Comp.BatchSizeLimit.getValueOr(DefaultSizeLimit));
+      return std::max(NumTasks,
+                      NumFiles / SizeLimit + (NumFiles % SizeLimit != 0));
+    }
+
     /// Select jobs that are batch-combinable from \c PendingExecution, combine
     /// them together into \p BatchJob instances (also inserted into \p
     /// BatchJobs), and enqueue all \c PendingExecution jobs (whether batched or
@@ -933,9 +1046,7 @@ namespace driver {
         return;
       }
 
-      size_t NumPartitions = (Comp.BatchCount.hasValue() ?
-                              Comp.BatchCount.getValue() :
-                              TQ->getNumberOfParallelTasks());
+      size_t NumPartitions = pickNumberOfPartitions();
       CommandSetVector Batchable, NonBatchable;
       std::vector<const Job *> Batches;
       bool PretendTheCommandLineIsTooLongOnce =
diff --git a/lib/Driver/Driver.cpp b/lib/Driver/Driver.cpp
@@ -646,6 +646,22 @@ getDriverBatchCount(llvm::opt::InputArgList &ArgList,
   return None;
 }
 
+/// Parses -driver-batch-size-limit; None if absent. Non-numeric or zero values
+/// (zero would later be a divisor) are diagnosed and also yield None.
+static Optional<unsigned>
+getDriverBatchSizeLimit(llvm::opt::InputArgList &ArgList,
+                        DiagnosticEngine &Diags) {
+  const Arg *A = ArgList.getLastArg(options::OPT_driver_batch_size_limit);
+  if (!A)
+    return None;
+  unsigned Limit = 0;
+  if (!StringRef(A->getValue()).getAsInteger(10, Limit) && Limit != 0)
+    return Limit;
+  Diags.diagnose(SourceLoc(), diag::error_invalid_arg_value,
+                 A->getAsString(ArgList), A->getValue());
+  return None;
+}
+
 std::unique_ptr<Compilation>
 Driver::buildCompilation(const ToolChain &TC,
                          std::unique_ptr<llvm::opt::InputArgList> ArgList) {
@@ -671,6 +687,8 @@ Driver::buildCompilation(const ToolChain &TC,
     ArgList->hasArg(options::OPT_driver_show_job_lifecycle);
   unsigned DriverBatchSeed = getDriverBatchSeed(*ArgList, Diags);
   Optional<unsigned> DriverBatchCount = getDriverBatchCount(*ArgList, Diags);
+  Optional<unsigned> DriverBatchSizeLimit =
+    getDriverBatchSizeLimit(*ArgList, Diags);
   bool DriverForceOneBatchRepartition =
       ArgList->hasArg(options::OPT_driver_force_one_batch_repartition);
 
@@ -850,6 +868,7 @@ Driver::buildCompilation(const ToolChain &TC,
                       BatchMode,
                       DriverBatchSeed,
                       DriverBatchCount,
+                      DriverBatchSizeLimit,
                       DriverForceOneBatchRepartition,
                       DriverSkipExecution,
                       SaveTemps,
diff --git a/test/Driver/batch_mode_overlong_argv.swift b/test/Driver/batch_mode_overlong_argv.swift
@@ -103,7 +103,7 @@
 // RUN: touch  %t/f_100_1.swift %t/f_100_2.swift %t/f_100_3.swift %t/f_100_4.swift %t/f_100_5.swift %t/f_100_6.swift %t/f_100_7.swift %t/f_100_8.swift %t/f_100_9.swift %t/f_100_10.swift
 // RUN: mkdir -p %t/additional/path/elements/often/make/filenames/longer/than/one/might/expect/especially/given/output/directories/deep/within/a/derived/data/folder/of/a/CI/machine/
 // Force the repartitioning:
-// RUN: %swiftc_driver -driver-show-job-lifecycle -driver-force-one-batch-repartition -v -c -module-name foo -o %t/additional/path/elements/often/make/filenames/longer/than/one/might/expect/especially/given/output/directories/deep/within/a/derived/data/folder/of/a/CI/machine/foo.o -emit-module -serialize-diagnostics -emit-dependencies -j 1 -enable-batch-mode %t/f_*.swift >%t/out.txt 2>&1
+// RUN: %swiftc_driver -driver-show-job-lifecycle -driver-batch-size-limit 10000 -driver-force-one-batch-repartition -v -c -module-name foo -o %t/additional/path/elements/often/make/filenames/longer/than/one/might/expect/especially/given/output/directories/deep/within/a/derived/data/folder/of/a/CI/machine/foo.o -emit-module -serialize-diagnostics -emit-dependencies -j 1 -enable-batch-mode %t/f_*.swift >%t/out.txt 2>&1
 // RUN: %FileCheck %s <%t/out.txt
 // CHECK-NOT: unable to execute command
 // CHECK: Forming into 1 batches
@@ -113,7 +113,7 @@
 // CHECK: Forming batch job from 500 constituents
 //
 // Try it without the force; supplementary output file maps should obviate the repartition:
-// RUN: %swiftc_driver -driver-show-job-lifecycle -v -c -module-name foo -o %t/additional/path/elements/often/make/filenames/longer/than/one/might/expect/especially/given/output/directories/deep/within/a/derived/data/folder/of/a/CI/machine/foo.o -emit-module -serialize-diagnostics -emit-dependencies -j 1 -enable-batch-mode %t/f_*.swift >%t/out2.txt 2>&1
+// RUN: %swiftc_driver -driver-show-job-lifecycle -driver-batch-size-limit 10000 -v -c -module-name foo -o %t/additional/path/elements/often/make/filenames/longer/than/one/might/expect/especially/given/output/directories/deep/within/a/derived/data/folder/of/a/CI/machine/foo.o -emit-module -serialize-diagnostics -emit-dependencies -j 1 -enable-batch-mode %t/f_*.swift >%t/out2.txt 2>&1
 // RUN: %FileCheck %s <%t/out2.txt -check-prefix=NO-REPARTITION
 // CHECK-NOT: unable to execute command
 // NO-REPARTITION: Forming into 1 batches
diff --git a/validation-test/Driver/batch_mode_size_limit.swift b/validation-test/Driver/batch_mode_size_limit.swift
@@ -0,0 +1,34 @@
+// RUN: %empty-directory(%t)
+// RUN: touch  %t/f_1_1.swift %t/f_1_2.swift %t/f_1_3.swift %t/f_1_4.swift %t/f_1_5.swift %t/f_1_6.swift %t/f_1_7.swift %t/f_1_8.swift %t/f_1_9.swift %t/f_1_10.swift
+// RUN: touch  %t/f_2_1.swift %t/f_2_2.swift %t/f_2_3.swift %t/f_2_4.swift %t/f_2_5.swift %t/f_2_6.swift %t/f_2_7.swift %t/f_2_8.swift %t/f_2_9.swift %t/f_2_10.swift
+// RUN: touch  %t/f_3_1.swift %t/f_3_2.swift %t/f_3_3.swift %t/f_3_4.swift %t/f_3_5.swift %t/f_3_6.swift %t/f_3_7.swift %t/f_3_8.swift %t/f_3_9.swift %t/f_3_10.swift
+// RUN: touch  %t/f_4_1.swift %t/f_4_2.swift %t/f_4_3.swift %t/f_4_4.swift %t/f_4_5.swift %t/f_4_6.swift %t/f_4_7.swift %t/f_4_8.swift %t/f_4_9.swift %t/f_4_10.swift
+// RUN: touch  %t/f_5_1.swift %t/f_5_2.swift %t/f_5_3.swift %t/f_5_4.swift %t/f_5_5.swift %t/f_5_6.swift %t/f_5_7.swift %t/f_5_8.swift %t/f_5_9.swift %t/f_5_10.swift
+// RUN: touch  %t/f_6_1.swift %t/f_6_2.swift %t/f_6_3.swift %t/f_6_4.swift %t/f_6_5.swift %t/f_6_6.swift %t/f_6_7.swift %t/f_6_8.swift %t/f_6_9.swift %t/f_6_10.swift
+// RUN: touch  %t/f_7_1.swift %t/f_7_2.swift %t/f_7_3.swift %t/f_7_4.swift %t/f_7_5.swift %t/f_7_6.swift %t/f_7_7.swift %t/f_7_8.swift %t/f_7_9.swift %t/f_7_10.swift
+// RUN: touch  %t/f_8_1.swift %t/f_8_2.swift %t/f_8_3.swift %t/f_8_4.swift %t/f_8_5.swift %t/f_8_6.swift %t/f_8_7.swift %t/f_8_8.swift %t/f_8_9.swift %t/f_8_10.swift
+// RUN: touch  %t/f_9_1.swift %t/f_9_2.swift %t/f_9_3.swift %t/f_9_4.swift %t/f_9_5.swift %t/f_9_6.swift %t/f_9_7.swift %t/f_9_8.swift %t/f_9_9.swift %t/f_9_10.swift
+// RUN: touch  %t/f_10_1.swift %t/f_10_2.swift %t/f_10_3.swift %t/f_10_4.swift %t/f_10_5.swift %t/f_10_6.swift %t/f_10_7.swift %t/f_10_8.swift %t/f_10_9.swift %t/f_10_10.swift
+// RUN: %swiftc_driver -driver-show-job-lifecycle -v -c -module-name foo -emit-module -serialize-diagnostics -emit-dependencies -j 1 -enable-batch-mode %t/f_*.swift >%t/out.txt 2>&1
+// RUN: %FileCheck %s <%t/out.txt
+// CHECK-NOT: unable to execute command
+// CHECK: Forming into 4 batches
+// CHECK: Forming batch job from 25 constituents
+// CHECK: Forming batch job from 25 constituents
+// CHECK: Forming batch job from 25 constituents
+// CHECK: Forming batch job from 25 constituents
+//
+// RUN: %swiftc_driver -driver-show-job-lifecycle -driver-batch-size-limit 10 -v -c -module-name foo -emit-module -serialize-diagnostics -emit-dependencies -j 1 -enable-batch-mode %t/f_*.swift >%t/out.txt 2>&1
+// RUN: %FileCheck %s <%t/out.txt -check-prefix=EXPLICIT-ARG
+// EXPLICIT-ARG-NOT: unable to execute command
+// EXPLICIT-ARG: Forming into 10 batches
+// EXPLICIT-ARG: Forming batch job from 10 constituents
+// EXPLICIT-ARG: Forming batch job from 10 constituents
+// EXPLICIT-ARG: Forming batch job from 10 constituents
+// EXPLICIT-ARG: Forming batch job from 10 constituents
+// EXPLICIT-ARG: Forming batch job from 10 constituents
+// EXPLICIT-ARG: Forming batch job from 10 constituents
+// EXPLICIT-ARG: Forming batch job from 10 constituents
+// EXPLICIT-ARG: Forming batch job from 10 constituents
+// EXPLICIT-ARG: Forming batch job from 10 constituents
+// EXPLICIT-ARG: Forming batch job from 10 constituents