From 736725f950f4ff5dd6ea9a184e6dca0b8a306c2f Mon Sep 17 00:00:00 2001 From: Jan Kuipers Date: Fri, 23 May 2025 08:33:39 +0200 Subject: [PATCH 1/7] Include direct memory and non-heap memory in ML memory calculations. --- .../server/cli/MachineDependentHeap.java | 2 +- .../org/elasticsearch/monitor/jvm/JvmInfo.java | 17 ++++++++++------- .../elasticsearch/xpack/ml/MachineLearning.java | 3 ++- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/MachineDependentHeap.java b/distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/MachineDependentHeap.java index 26fd7294ed557..c367d89e897d5 100644 --- a/distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/MachineDependentHeap.java +++ b/distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/MachineDependentHeap.java @@ -95,7 +95,7 @@ protected int getHeapSizeMb(Settings nodeSettings, MachineNodeRole role, long av * * If this formula is changed then corresponding changes must be made to the {@code NativeMemoryCalculator} and * {@code MlAutoscalingDeciderServiceTests} classes in the ML plugin code. Failure to keep the logic synchronized - * could result in repeated autoscaling up and down. + * could result in ML processes crashing with OOM errors or repeated autoscaling up and down. */ case ML_ONLY -> { if (availableMemory <= (GB * 16)) { diff --git a/server/src/main/java/org/elasticsearch/monitor/jvm/JvmInfo.java b/server/src/main/java/org/elasticsearch/monitor/jvm/JvmInfo.java index f827bb30c7e4a..a1faf24b512c4 100644 --- a/server/src/main/java/org/elasticsearch/monitor/jvm/JvmInfo.java +++ b/server/src/main/java/org/elasticsearch/monitor/jvm/JvmInfo.java @@ -43,14 +43,7 @@ public class JvmInfo implements ReportingService.Info { long nonHeapInit = memoryMXBean.getNonHeapMemoryUsage().getInit() < 0 ? 
0 : memoryMXBean.getNonHeapMemoryUsage().getInit(); long nonHeapMax = memoryMXBean.getNonHeapMemoryUsage().getMax() < 0 ? 0 : memoryMXBean.getNonHeapMemoryUsage().getMax(); long directMemoryMax = 0; - try { - Class vmClass = Class.forName("sun.misc.VM"); - directMemoryMax = (Long) vmClass.getMethod("maxDirectMemory").invoke(null); - } catch (Exception t) { - // ignore - } String[] inputArguments = runtimeMXBean.getInputArguments().toArray(new String[runtimeMXBean.getInputArguments().size()]); - Mem mem = new Mem(heapInit, heapMax, nonHeapInit, nonHeapMax, directMemoryMax); String bootClassPath; try { @@ -130,6 +123,11 @@ public class JvmInfo implements ReportingService.Info { configuredMaxHeapSize = Long.parseLong((String) valueMethod.invoke(maxHeapSizeVmOptionObject)); } catch (Exception ignored) {} + try { + Object maxDirectMemorySizeVmOptionObject = vmOptionMethod.invoke(hotSpotDiagnosticMXBean, "MaxDirectMemorySize"); + directMemoryMax = Long.parseLong((String) valueMethod.invoke(maxDirectMemorySizeVmOptionObject)); + } catch (Exception ignored) {} + try { Object useSerialGCVmOptionObject = vmOptionMethod.invoke(hotSpotDiagnosticMXBean, "UseSerialGC"); useSerialGC = (String) valueMethod.invoke(useSerialGCVmOptionObject); @@ -139,6 +137,8 @@ public class JvmInfo implements ReportingService.Info { } + Mem mem = new Mem(heapInit, heapMax, nonHeapInit, nonHeapMax, directMemoryMax); + INSTANCE = new JvmInfo( ProcessHandle.current().pid(), System.getProperty("java.version"), @@ -496,5 +496,8 @@ public ByteSizeValue getHeapMax() { return ByteSizeValue.ofBytes(heapMax); } + public ByteSizeValue getTotalMax() { + return ByteSizeValue.ofBytes(heapMax + nonHeapMax + directMemoryMax); + } } } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearning.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearning.java index 38113426f3fdb..4a81ca4c903e0 100644 --- 
a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearning.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearning.java @@ -845,7 +845,8 @@ public Settings additionalSettings() { machineMemoryAttrName, Long.toString(OsProbe.getInstance().osStats().getMem().getAdjustedTotal().getBytes()) ); - addMlNodeAttribute(additionalSettings, jvmSizeAttrName, Long.toString(Runtime.getRuntime().maxMemory())); + + addMlNodeAttribute(additionalSettings, jvmSizeAttrName, Long.toString(JvmInfo.jvmInfo().getMem().getTotalMax().getBytes())); addMlNodeAttribute( additionalSettings, deprecatedAllocatedProcessorsAttrName, From 1eae27c5650d64c7bff39f56523d7e5ce3149ccf Mon Sep 17 00:00:00 2001 From: Jan Kuipers Date: Mon, 2 Jun 2025 12:13:57 +0200 Subject: [PATCH 2/7] Reduce ML_ONLY heap size, so that direct memory is accounted for. --- .../server/cli/MachineDependentHeap.java | 13 +++++++++++-- .../server/cli/MachineDependentHeapTests.java | 10 +++++----- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/MachineDependentHeap.java b/distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/MachineDependentHeap.java index c367d89e897d5..7d15ebcbc9886 100644 --- a/distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/MachineDependentHeap.java +++ b/distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/MachineDependentHeap.java @@ -98,10 +98,19 @@ protected int getHeapSizeMb(Settings nodeSettings, MachineNodeRole role, long av * could result in ML processes crashing with OOM errors or repeated autoscaling up and down. */ case ML_ONLY -> { + /* + * An ML node used to have 40% of the total memory for the Java heap, and the remainder + * for ML and overhead (the operating system). This did not account for the Java direct + * memory, which equals half of the heap size (see JvmErgonomics). 
+ * Right now, a factor of 2/3 is applied to the heap size here, leaving the ML memory + * formula the same. That means the formula now also correctly accounts for direct memory, + * since the heap (2/3 * 40% of the memory) plus the direct memory (1/3 * 40% of the memory) + * equals the original 40% of the total memory. + */ if (availableMemory <= (GB * 16)) { - yield mb((long) (availableMemory * .4), 4); + yield mb((long) (availableMemory * .4 * 2/3), 4); } else { - yield mb((long) min((GB * 16) * .4 + (availableMemory - GB * 16) * .1, MAX_HEAP_SIZE), 4); + yield mb((long) min(((GB * 16) * .4 + (availableMemory - GB * 16) * .1) * 2/3, MAX_HEAP_SIZE), 4); } } /* diff --git a/distribution/tools/server-cli/src/test/java/org/elasticsearch/server/cli/MachineDependentHeapTests.java b/distribution/tools/server-cli/src/test/java/org/elasticsearch/server/cli/MachineDependentHeapTests.java index 64b46f1bca98f..12f455f242dc0 100644 --- a/distribution/tools/server-cli/src/test/java/org/elasticsearch/server/cli/MachineDependentHeapTests.java +++ b/distribution/tools/server-cli/src/test/java/org/elasticsearch/server/cli/MachineDependentHeapTests.java @@ -56,13 +56,13 @@ public void testMasterOnlyOptions() throws Exception { } public void testMlOnlyOptions() throws Exception { - assertHeapOptions(1, containsInAnyOrder("-Xmx408m", "-Xms408m"), "ml"); - assertHeapOptions(4, containsInAnyOrder("-Xmx1636m", "-Xms1636m"), "ml"); - assertHeapOptions(32, containsInAnyOrder("-Xmx8192m", "-Xms8192m"), "ml"); - assertHeapOptions(64, containsInAnyOrder("-Xmx11468m", "-Xms11468m"), "ml"); + assertHeapOptions(1, containsInAnyOrder("-Xmx272m", "-Xms272m"), "ml"); + assertHeapOptions(4, containsInAnyOrder("-Xmx1092m", "-Xms1092m"), "ml"); + assertHeapOptions(32, containsInAnyOrder("-Xmx5460m", "-Xms5460m"), "ml"); + assertHeapOptions(64, containsInAnyOrder("-Xmx7644m", "-Xms7644m"), "ml"); // We'd never see a node this big in Cloud, but this assertion proves that the 31GB absolute maximum // 
eventually kicks in (because 0.4 * 16 + 0.1 * (263 - 16) > 31) - assertHeapOptions(263, containsInAnyOrder("-Xmx31744m", "-Xms31744m"), "ml"); + assertHeapOptions(263, containsInAnyOrder("-Xmx21228m", "-Xms21228m"), "ml"); } public void testDataNodeOptions() throws Exception { From 56669f832fcc8973ddce7f0368c2ccd082fb9c0f Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Mon, 2 Jun 2025 12:49:04 +0000 Subject: [PATCH 3/7] [CI] Auto commit changes from spotless --- .../org/elasticsearch/server/cli/MachineDependentHeap.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/MachineDependentHeap.java b/distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/MachineDependentHeap.java index 7d15ebcbc9886..9ccd5e62ed1f1 100644 --- a/distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/MachineDependentHeap.java +++ b/distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/MachineDependentHeap.java @@ -108,9 +108,9 @@ protected int getHeapSizeMb(Settings nodeSettings, MachineNodeRole role, long av * equals the original 40% of the total memory. 
*/ if (availableMemory <= (GB * 16)) { - yield mb((long) (availableMemory * .4 * 2/3), 4); + yield mb((long) (availableMemory * .4 * 2 / 3), 4); } else { - yield mb((long) min(((GB * 16) * .4 + (availableMemory - GB * 16) * .1) * 2/3, MAX_HEAP_SIZE), 4); + yield mb((long) min(((GB * 16) * .4 + (availableMemory - GB * 16) * .1) * 2 / 3, MAX_HEAP_SIZE), 4); } } /* From 87e0e0a0efd2630f1c00d663ce4ade92f15fbaf0 Mon Sep 17 00:00:00 2001 From: Jan Kuipers Date: Tue, 3 Jun 2025 09:37:14 +0200 Subject: [PATCH 4/7] changelog --- docs/changelog/128742.yaml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 docs/changelog/128742.yaml diff --git a/docs/changelog/128742.yaml b/docs/changelog/128742.yaml new file mode 100644 index 0000000000000..ce974301f2dfc --- /dev/null +++ b/docs/changelog/128742.yaml @@ -0,0 +1,5 @@ +pr: 128742 +summary: "Account for Java direct memory on machine learning nodes to prevent out-of-memory crashes." +area: Machine Learning +type: bug +issues: [] From 36c9b2074f9440ad3f3a4b15281aaa3f4d4ec04d Mon Sep 17 00:00:00 2001 From: Jan Kuipers Date: Tue, 3 Jun 2025 09:50:53 +0200 Subject: [PATCH 5/7] improve docs --- .../server/cli/MachineDependentHeap.java | 25 ++++++++----------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/MachineDependentHeap.java b/distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/MachineDependentHeap.java index 9ccd5e62ed1f1..11e445ecd7986 100644 --- a/distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/MachineDependentHeap.java +++ b/distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/MachineDependentHeap.java @@ -76,12 +76,16 @@ protected int getHeapSizeMb(Settings nodeSettings, MachineNodeRole role, long av /* * Machine learning only node. * - *

<p>Heap is computed as: - * <ul> - * <li>40% of total system memory when total system memory 16 gigabytes or less.</li> - * <li>40% of the first 16 gigabytes plus 10% of memory above that when total system memory is more than 16 gigabytes.</li> - * <li>The absolute maximum heap size is 31 gigabytes.</li> - * </ul>
+ * The memory reserved for Java is computed as: + * - 40% of total system memory when total system memory is 16 gigabytes or less. + * - 40% of the first 16 gigabytes plus 10% of memory above that when total system memory is more than 16 gigabytes. + * - The absolute maximum heap size is 31 gigabytes. + * + * This Java memory is divided as follows: + * - 2/3 of the Java memory is reserved for the Java heap. + * - 1/3 of the Java memory is reserved for the Java direct memory. + * + * The direct memory being half of the heap is set by the JvmErgonomics class. * * In all cases the result is rounded down to the next whole multiple of 4 megabytes. * The reason for doing this is that Java will round requested heap sizes to a multiple @@ -98,15 +102,6 @@ protected int getHeapSizeMb(Settings nodeSettings, MachineNodeRole role, long av * could result in ML processes crashing with OOM errors or repeated autoscaling up and down. */ case ML_ONLY -> { - /* - * An ML node used to have 40% of the total memory for the Java heap, and the remainder - * for ML and overhead (the operating system). This did not account for the Java direct - * memory, which equals half of the heap size (see JvmErgonomics). - * Right now, a factor of 2/3 is applied to the heap size here, leaving the ML memory - * formula the same. That means the formula now also correctly accounts for direct memory, - * since the heap (2/3 * 40% of the memory) plus the direct memory (1/3 * 40% of the memory) - * equals the original 40% of the total memory. 
- */ if (availableMemory <= (GB * 16)) { yield mb((long) (availableMemory * .4 * 2 / 3), 4); } else { From 6ef45ddf467809bc3e44c375af376b34d0dd97fa Mon Sep 17 00:00:00 2001 From: Jan Kuipers Date: Wed, 4 Jun 2025 16:38:41 +0200 Subject: [PATCH 6/7] Reuse direct memory to heap factor --- .../org/elasticsearch/server/cli/JvmErgonomics.java | 4 +++- .../server/cli/MachineDependentHeap.java | 11 ++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/JvmErgonomics.java b/distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/JvmErgonomics.java index 1160589e43966..1970dd82b8ebe 100644 --- a/distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/JvmErgonomics.java +++ b/distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/JvmErgonomics.java @@ -28,6 +28,8 @@ */ final class JvmErgonomics { + static final double DIRECT_MEMORY_TO_HEAP_FACTOR = 0.5; + private JvmErgonomics() { throw new AssertionError("No instances intended"); } @@ -44,7 +46,7 @@ static List choose(final List userDefinedJvmOptions, Settings no final long heapSize = JvmOption.extractMaxHeapSize(finalJvmOptions); final long maxDirectMemorySize = JvmOption.extractMaxDirectMemorySize(finalJvmOptions); if (maxDirectMemorySize == 0) { - ergonomicChoices.add("-XX:MaxDirectMemorySize=" + heapSize / 2); + ergonomicChoices.add("-XX:MaxDirectMemorySize=" + (long) (DIRECT_MEMORY_TO_HEAP_FACTOR * heapSize)); } final boolean tuneG1GCForSmallHeap = tuneG1GCForSmallHeap(heapSize); diff --git a/distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/MachineDependentHeap.java b/distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/MachineDependentHeap.java index 11e445ecd7986..126c426869f39 100644 --- a/distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/MachineDependentHeap.java +++ 
b/distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/MachineDependentHeap.java @@ -102,10 +102,15 @@ protected int getHeapSizeMb(Settings nodeSettings, MachineNodeRole role, long av * could result in ML processes crashing with OOM errors or repeated autoscaling up and down. */ case ML_ONLY -> { - if (availableMemory <= (GB * 16)) { - yield mb((long) (availableMemory * .4 * 2 / 3), 4); + double heapFractionBelow16GB = 0.4 / (1.0 + JvmErgonomics.DIRECT_MEMORY_TO_HEAP_FACTOR); + double heapFractionAbove16GB = 0.1 / (1.0 + JvmErgonomics.DIRECT_MEMORY_TO_HEAP_FACTOR); + if (availableMemory <= GB * 16) { + yield mb((long) (availableMemory * heapFractionBelow16GB), 4); } else { - yield mb((long) min(((GB * 16) * .4 + (availableMemory - GB * 16) * .1) * 2 / 3, MAX_HEAP_SIZE), 4); + yield mb( + (long) min(GB * 16 * heapFractionBelow16GB + (availableMemory - GB * 16) * heapFractionAbove16GB, MAX_HEAP_SIZE), + 4 + ); } } /* From 2416b3de1ea68ede9ef9fdbb4c0931edd26ae5ea Mon Sep 17 00:00:00 2001 From: Jan Kuipers Date: Wed, 4 Jun 2025 16:50:00 +0200 Subject: [PATCH 7/7] feature flag --- .../server/cli/MachineDependentHeap.java | 11 +++++++++-- .../org/elasticsearch/xpack/ml/MachineLearning.java | 9 ++++++++- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/MachineDependentHeap.java b/distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/MachineDependentHeap.java index 126c426869f39..b68e374bbdb94 100644 --- a/distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/MachineDependentHeap.java +++ b/distribution/tools/server-cli/src/main/java/org/elasticsearch/server/cli/MachineDependentHeap.java @@ -11,6 +11,7 @@ import org.elasticsearch.cluster.node.DiscoveryNodeRole; import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.util.FeatureFlag; import org.elasticsearch.node.NodeRoleSettings; import 
java.io.IOException; @@ -37,6 +38,8 @@ public class MachineDependentHeap { protected static final long MAX_HEAP_SIZE = GB * 31; // 31GB protected static final long MIN_HEAP_SIZE = 1024 * 1024 * 128; // 128MB + private static final FeatureFlag NEW_ML_MEMORY_COMPUTATION_FEATURE_FLAG = new FeatureFlag("new_ml_memory_computation"); + public MachineDependentHeap() {} /** @@ -102,8 +105,12 @@ protected int getHeapSizeMb(Settings nodeSettings, MachineNodeRole role, long av * could result in ML processes crashing with OOM errors or repeated autoscaling up and down. */ case ML_ONLY -> { - double heapFractionBelow16GB = 0.4 / (1.0 + JvmErgonomics.DIRECT_MEMORY_TO_HEAP_FACTOR); - double heapFractionAbove16GB = 0.1 / (1.0 + JvmErgonomics.DIRECT_MEMORY_TO_HEAP_FACTOR); + double heapFractionBelow16GB = 0.4; + double heapFractionAbove16GB = 0.1; + if (NEW_ML_MEMORY_COMPUTATION_FEATURE_FLAG.isEnabled()) { + heapFractionBelow16GB = 0.4 / (1.0 + JvmErgonomics.DIRECT_MEMORY_TO_HEAP_FACTOR); + heapFractionAbove16GB = 0.1 / (1.0 + JvmErgonomics.DIRECT_MEMORY_TO_HEAP_FACTOR); + } if (availableMemory <= GB * 16) { yield mb((long) (availableMemory * heapFractionBelow16GB), 4); } else { diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearning.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearning.java index 4a81ca4c903e0..434df0d1a07a0 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearning.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearning.java @@ -41,6 +41,7 @@ import org.elasticsearch.common.settings.SettingsModule; import org.elasticsearch.common.unit.ByteSizeValue; import org.elasticsearch.common.unit.Processors; +import org.elasticsearch.common.util.FeatureFlag; import org.elasticsearch.common.util.concurrent.EsExecutors; import org.elasticsearch.core.TimeValue; import org.elasticsearch.env.Environment; @@ -557,6 +558,8 @@ public class MachineLearning extends 
Plugin License.OperationMode.PLATINUM ); + private static final FeatureFlag NEW_ML_MEMORY_COMPUTATION_FEATURE_FLAG = new FeatureFlag("new_ml_memory_computation"); + @Override public Map getProcessors(Processor.Parameters parameters) { if (this.enabled == false) { @@ -846,7 +849,11 @@ public Settings additionalSettings() { Long.toString(OsProbe.getInstance().osStats().getMem().getAdjustedTotal().getBytes()) ); - addMlNodeAttribute(additionalSettings, jvmSizeAttrName, Long.toString(JvmInfo.jvmInfo().getMem().getTotalMax().getBytes())); + long jvmSize = Runtime.getRuntime().maxMemory(); + if (NEW_ML_MEMORY_COMPUTATION_FEATURE_FLAG.isEnabled()) { + jvmSize = JvmInfo.jvmInfo().getMem().getTotalMax().getBytes(); + } + addMlNodeAttribute(additionalSettings, jvmSizeAttrName, Long.toString(jvmSize)); addMlNodeAttribute( additionalSettings, deprecatedAllocatedProcessorsAttrName,