prestodb
diff --git a/presto-spark-base/src/main/java/com/facebook/presto/spark/execution/nativeprocess/NativeExecutionProcess.java b/presto-spark-base/src/main/java/com/facebook/presto/spark/execution/nativeprocess/NativeExecutionProcess.java
Lines changed: 103 additions & 10 deletions
@@ -15,6 +15,7 @@
 
 import com.facebook.airlift.json.JsonCodec;
 import com.facebook.airlift.log.Logger;
+import com.facebook.airlift.units.DataSize;
 import com.facebook.airlift.units.Duration;
 import com.facebook.presto.Session;
 import com.facebook.presto.client.ServerInfo;
@@ -23,6 +24,7 @@
 import com.facebook.presto.spark.execution.http.server.RequestErrorTracker;
 import com.facebook.presto.spark.execution.http.server.smile.BaseResponse;
 import com.facebook.presto.spark.execution.property.NativeExecutionSystemConfig;
+import com.facebook.presto.spark.execution.property.PrestoSparkWorkerProperty;
 import com.facebook.presto.spark.execution.property.WorkerProperty;
 import com.facebook.presto.spi.PrestoException;
 import com.google.common.annotations.VisibleForTesting;
@@ -32,6 +34,7 @@
 import com.google.common.util.concurrent.SettableFuture;
 import okhttp3.HttpUrl;
 import okhttp3.OkHttpClient;
+import org.apache.spark.SparkConf;
 import org.apache.spark.SparkEnv$;
 import org.apache.spark.SparkFiles;
 
@@ -61,11 +64,12 @@
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicBoolean;
 
+import static com.facebook.airlift.units.DataSize.Unit.BYTE;
+import static com.facebook.airlift.units.DataSize.Unit.GIGABYTE;
 import static com.facebook.presto.spi.StandardErrorCode.GENERIC_INTERNAL_ERROR;
 import static com.facebook.presto.spi.StandardErrorCode.NATIVE_EXECUTION_BINARY_NOT_EXIST;
 import static com.facebook.presto.spi.StandardErrorCode.NATIVE_EXECUTION_PROCESS_LAUNCH_ERROR;
 import static com.facebook.presto.spi.StandardErrorCode.NATIVE_EXECUTION_TASK_ERROR;
-import static com.google.common.base.Preconditions.checkArgument;
 import static com.google.common.util.concurrent.Futures.addCallback;
 import static com.google.common.util.concurrent.MoreExecutors.directExecutor;
 import static java.lang.String.format;
@@ -82,6 +86,7 @@ public class NativeExecutionProcess
     private static final String WORKER_CONFIG_FILE = "/config.properties";
     private static final String WORKER_NODE_CONFIG_FILE = "/node.properties";
     private static final String WORKER_CONNECTOR_CONFIG_FILE = "/catalog/";
+    private static final String NATIVE_PROCESS_MEMORY_SPARK_CONF_NAME = "spark.memory.offHeap.size";
     private static final int SIGSYS = 31;
 
     private final String executablePath;
@@ -131,6 +136,8 @@ public NativeExecutionProcess(
                 scheduledExecutorService,
                 "getting native process status");
         this.workerProperty = requireNonNull(workerProperty, "workerProperty is null");
+        // Update any runtime configs to be used by presto native worker
+        updateWorkerProperties();
     }
 
     /**
@@ -325,25 +332,111 @@ private static int getAvailableTcpPort(String nodeInternalAddress)
         }
     }
 
-    private String getNativeExecutionCatalogName(Session session)
+    /**
+     * Hands the worker's configuration locations to {@code workerProperty.populateAllProperties}.
+     * The three locations are resolved against {@code configBasePath}: the system config file
+     * ({@code /config.properties}), the node config file ({@code /node.properties}) and the
+     * catalog directory ({@code /catalog/}).
+     *
+     * @param configBasePath base directory that the config file and catalog directory paths are resolved against
+     * @throws IOException presumably when a configuration file cannot be written -- confirm against populateAllProperties
+     */
+    private void populateConfigurationFiles(String configBasePath)
+            throws IOException
     {
-        checkArgument(session.getCatalog().isPresent(), "Catalog isn't set in the session.");
-        return session.getCatalog().get();
+        workerProperty.populateAllProperties(
+                Paths.get(configBasePath, WORKER_CONFIG_FILE),
+                Paths.get(configBasePath, WORKER_NODE_CONFIG_FILE),
+                Paths.get(configBasePath, WORKER_CONNECTOR_CONFIG_FILE));  // Directory path for catalogs
     }
 
-    private void populateConfigurationFiles(String configBasePath)
-            throws IOException
+    /**
+     * Applies runtime-derived settings to the worker's system config: first the memory
+     * properties, then the HTTP port assigned to this worker.
+     *
+     * NOTE(review): this runs from the constructor and reaches the protected
+     * updateWorkerMemoryProperties(); a subclass override would execute before that subclass's
+     * own fields are initialized -- confirm this is intended.
+     */
+    private void updateWorkerProperties()
     {
+        // Align the worker's system/query memory with the Spark off-heap size, when one is set
+        updateWorkerMemoryProperties();
+
         // The reason we have to pick and assign the port per worker is in our prod environment,
         // there is no port isolation among all the containers running on the same host, so we have
         // to pick unique port per worker to avoid port collision. This config will be passed down to
         // the native execution process eventually for process initialization.
         workerProperty.getSystemConfig()
                 .update(NativeExecutionSystemConfig.HTTP_SERVER_HTTP_PORT, String.valueOf(port));
-        workerProperty.populateAllProperties(
-                Paths.get(configBasePath, WORKER_CONFIG_FILE),
-                Paths.get(configBasePath, WORKER_NODE_CONFIG_FILE),
-                Paths.get(configBasePath, WORKER_CONNECTOR_CONFIG_FILE));  // Directory path for catalogs
+    }
+
+    /**
+     * Returns the configuration of the current Spark environment.
+     *
+     * @return the active {@link SparkConf}, or {@code null} when no Spark environment is available
+     */
+    protected SparkConf getSparkConf()
+    {
+        // Read the environment exactly once: it is a process-wide value, so a second read could
+        // return null after the null check has already passed.
+        org.apache.spark.SparkEnv sparkEnv = SparkEnv$.MODULE$.get();
+        return sparkEnv == null ? null : sparkEnv.conf();
+    }
+
+    /**
+     * Exposes the worker property held by this process as a {@link PrestoSparkWorkerProperty}.
+     *
+     * @return the {@code workerProperty} field, downcast to {@link PrestoSparkWorkerProperty}
+     * @throws ClassCastException if the field holds an instance of some other type
+     */
+    protected PrestoSparkWorkerProperty getWorkerProperty()
+    {
+        PrestoSparkWorkerProperty sparkWorkerProperty = (PrestoSparkWorkerProperty) workerProperty;
+        return sparkWorkerProperty;
+    }
+
+    /**
+     * Computes the values of system-memory-gb and query-memory-gb that the native worker is
+     * started with.
+     *
+     * This is mainly useful when Spark has provisioned a larger container to re-run a task that
+     * previously ran out of memory: without the adjustment below the cpp process would not be
+     * able to use the additional memory. The same logic applies to the first attempt and to
+     * subsequent larger-container retry attempts.
+     *
+     * The logic is:
+     * - New system-memory-gb = spark.memory.offHeap.size
+     * - The new query-memory-gb keeps the old query-memory to system-memory ratio, i.e.
+     *   newQueryMemory = newSystemMemory * (oldQueryMemory / oldSystemMemory)
+     *
+     * Nothing is changed (and the reason is logged) when the Spark conf is unavailable, when
+     * spark.memory.offHeap.size is not set, when either memory property is missing from the
+     * system config, when the off-heap size or the current system memory is zero, or when the
+     * off-heap size is smaller than the current system memory.
+     *
+     *   TODO: In future make this algorithm more configurable. i.e. we might want a min/max
+     *         cap on the systemMemoryGb-queryMemoryGb buffer. Currently we just assume ratio
+     *         is good enough
+     */
+    protected void updateWorkerMemoryProperties()
+    {
+        SparkConf conf = getSparkConf();
+        if (conf == null) {
+            log.info("Not adjusting native process memory as conf is null");
+            return;
+        }
+        if (!conf.contains(NATIVE_PROCESS_MEMORY_SPARK_CONF_NAME)) {
+            log.info("Not adjusting native process memory as %s is not set", NATIVE_PROCESS_MEMORY_SPARK_CONF_NAME);
+            return;
+        }
+
+        // A missing entry would otherwise be concatenated into "nullGB" and make DataSize.valueOf
+        // throw; treat it like every other unmet precondition and skip the adjustment instead.
+        Object configuredSystemMemoryGb = workerProperty.getSystemConfig().getAllProperties()
+                .get(NativeExecutionSystemConfig.SYSTEM_MEMORY_GB);
+        Object configuredQueryMemoryGb = workerProperty.getSystemConfig().getAllProperties()
+                .get(NativeExecutionSystemConfig.QUERY_MEMORY_GB);
+        if (configuredSystemMemoryGb == null || configuredQueryMemoryGb == null) {
+            log.info("Not adjusting native process memory as %s or %s is not configured",
+                    NativeExecutionSystemConfig.SYSTEM_MEMORY_GB,
+                    NativeExecutionSystemConfig.QUERY_MEMORY_GB);
+            return;
+        }
+
+        DataSize offHeapMemory = DataSize.succinctDataSize(
+                conf.getSizeAsBytes(NATIVE_PROCESS_MEMORY_SPARK_CONF_NAME), BYTE);
+        // Both properties hold a bare number of gigabytes, so append the unit before parsing
+        DataSize currentSystemMemory = DataSize.valueOf(configuredSystemMemoryGb + GIGABYTE.getUnitString());
+        DataSize currentQueryMemory = DataSize.valueOf(configuredQueryMemoryGb + GIGABYTE.getUnitString());
+        if (offHeapMemory.toBytes() == 0
+                || currentSystemMemory.toBytes() == 0
+                || offHeapMemory.toBytes() < currentSystemMemory.toBytes()) {
+            // Only ever grow the worker's memory; both values are logged as DataSize so their units agree
+            log.info("Not adjusting native process memory as" +
+                    " offHeapMemoryBytes=%s,currentSystemMemory=%s are invalid", offHeapMemory, currentSystemMemory);
+            return;
+        }
+
+        log.info("Setting Native Worker system-memory-gb to offHeap: %s", offHeapMemory);
+        DataSize newSystemMemory = offHeapMemory.convertTo(GIGABYTE);
+
+        // Preserve the configured query-memory : system-memory ratio at the new size
+        double queryMemoryFraction = currentQueryMemory.toBytes() * 1.0 / currentSystemMemory.toBytes();
+        DataSize newQueryMemory = DataSize.succinctDataSize(
+                queryMemoryFraction * newSystemMemory.toBytes(), BYTE);
+        log.info("Dynamically Tuning Presto Native Memory Configs. " +
+                        "Configured SparkOffHeap: %s; " +
+                        "[oldSystemMemory: %s, newSystemMemory: %s], queryMemoryFraction: %s, " +
+                        "[oldQueryMemory: %s, newQueryMemory: %s]",
+                offHeapMemory,
+                currentSystemMemory,
+                newSystemMemory,
+                queryMemoryFraction,
+                currentQueryMemory,
+                newQueryMemory);
+
+        // The *-gb properties take whole gigabytes, so fractional parts are truncated
+        workerProperty.getSystemConfig()
+                .update(NativeExecutionSystemConfig.SYSTEM_MEMORY_GB,
+                        String.valueOf((int) newSystemMemory.getValue(GIGABYTE)));
+        workerProperty.getSystemConfig()
+                .update(NativeExecutionSystemConfig.QUERY_MEMORY_GB,
+                        String.valueOf((int) newQueryMemory.getValue(GIGABYTE)));
+        workerProperty.getSystemConfig()
+                .update(NativeExecutionSystemConfig.QUERY_MAX_MEMORY_PER_NODE,
+                        newQueryMemory.convertTo(GIGABYTE).toString());
+    }
 
     private void doGetServerInfo(SettableFuture<ServerInfo> future)