
Commit e61e884

Per-partition memory pressure threshold (#47)
* Move to per-partition memory pressure thresholds, with a default setting of none.
* Rename config to be more descriptive.
* Format fix.
* Addressing PR feedback.
1 parent 4f09c0b commit e61e884

File tree

3 files changed: +101 −7 lines

README.md

Lines changed: 70 additions & 0 deletions
@@ -216,6 +216,76 @@ This will partition the table by:

- Month extracted from the integer timestamp
- Event type as a string column

## Memory Sizing

The connector uses two distinct memory regions:

| Region | Controlled by | Contents |
|--------|--------------|----------|
| JVM heap | `-Xmx` | Kafka Connect framework, record deserialization, connector bookkeeping |
| Off-heap (direct) | Arrow `RootAllocator` | Arrow columnar buffers (the bulk of memory under load) |

Arrow off-heap memory is **not** subject to `-Xmx`. It is bounded only by the OS/container memory limit, which means an under-sized pod will be OOMKilled before the JVM ever notices pressure.
### Per-Partition Memory Pressure

The `memory.pressure.per.partition.bytes` setting defines a per-partition threshold for Arrow buffer accumulation. When a single partition's buffered data exceeds this value, that partition is flushed immediately, independent of the normal `flush.size` / `file.size.bytes` / `flush.interval.ms` thresholds.

Setting it to `0` (the default) disables the check entirely.
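For example, the per-partition threshold can sit alongside the normal flush settings in a connector config (the values below are illustrative, not recommendations):

```json
{
  "flush.size": 100000,
  "flush.interval.ms": 60000,
  "file.size.bytes": 268435456,
  "memory.pressure.per.partition.bytes": 536870912
}
```

Here the 512MB per-partition check acts as a safety valve above the normal thresholds: a partition flushes as soon as any one of the four trips.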
### Sizing Formula

Given:

- **T** = maximum number of sink tasks that can be assigned to a single worker/pod
  - In distributed mode, this is ≤ the connector-wide `tasks.max` and depends on how many workers you run and how tasks are balanced.
- **P** = max partitions assigned to a single task
- **M** = `memory.pressure.per.partition.bytes` (per-partition threshold)
- **H** = JVM heap (`-Xmx`)
The worst-case Arrow off-heap usage **per pod** is:

```
arrow_max = T × P × M
```

This formula is per pod/worker and uses **T** as "tasks that can co-reside on a single worker":

- Worst-case placement (all tasks on one worker): `T = tasks.max`.
- With `W` workers and roughly even distribution: `T ≈ ceil(tasks.max / W)`.
Each partition can buffer up to `M` bytes before being force-flushed. The container memory limit must cover both heap and off-heap:

```
container_memory ≥ H + arrow_max + overhead
```

Where `overhead` accounts for JVM metaspace, thread stacks, OS buffers, etc. (typically 512MB–1GB).
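The sizing arithmetic above can be checked with a small stand-alone helper (class and method names are illustrative, not part of the connector):

```java
public class MemorySizing {

  // Tasks that can co-reside on one worker with even balancing:
  // T ≈ ceil(tasksMax / workers).
  static long tasksPerWorker(long tasksMax, long workers) {
    return (tasksMax + workers - 1) / workers;
  }

  // Worst-case Arrow off-heap bytes per worker: arrow_max = T × P × M.
  static long arrowMax(long tasks, long partitionsPerTask, long perPartitionBytes) {
    return tasks * partitionsPerTask * perPartitionBytes;
  }

  // Minimum container memory: container ≥ H + arrow_max + overhead.
  static long containerFloor(long heapBytes, long arrowMaxBytes, long overheadBytes) {
    return heapBytes + arrowMaxBytes + overheadBytes;
  }

  public static void main(String[] args) {
    final long MB = 1024L * 1024L, GB = 1024L * MB;
    long arrow = arrowMax(3, 8, 512 * MB);                  // 3 × 8 × 512MB = 12GB
    long container = containerFloor(2 * GB, arrow, 1 * GB); // 2GB + 12GB + 1GB = 15GB
    System.out.println(arrow / GB + " " + container / GB);  // prints "12 15"
  }
}
```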
### Example

| Parameter | Value |
|-----------|-------|
| `tasks.max` | 3 |
| Partitions per task | 8 |
| `memory.pressure.per.partition.bytes` | 536870912 (512MB) |
| `-Xmx` | 2g |

```
arrow_max = 3 × 8 × 512MB = 12GB
container ≥ 2GB + 12GB + 1GB = 15GB
```
If 15GB is too large, reduce the per-partition threshold or reduce `tasks.max`:

```
# 2 tasks, 256MB threshold:
arrow_max = 2 × 8 × 256MB = 4GB
container ≥ 2GB + 4GB + 1GB = 7GB
```
### Disk Spill as an Alternative

If off-heap memory is constrained, enable `spill.enabled=true` instead. This writes Arrow batches to local disk immediately after conversion, reducing per-task Arrow memory to a few MB at the cost of additional disk I/O. When spilling is enabled, `memory.pressure.per.partition.bytes` has no effect: there is no in-memory accumulation to threshold against.
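A minimal spill setup might look like the following (`spill.enabled` and `spill.directory` are the keys defined in `DucklakeSinkConfig`; the directory path is illustrative):

```json
{
  "spill.enabled": true,
  "spill.directory": "/var/tmp/ducklake-spill"
}
```

Provision the volume backing `spill.directory` for the data that would otherwise accumulate in memory between flushes.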
## Example Connector Config (Kafka Connect REST)

```json

src/main/java/com/inyo/ducklake/connect/DucklakeSinkConfig.java

Lines changed: 22 additions & 0 deletions
```diff
@@ -61,6 +61,10 @@ public class DucklakeSinkConfig extends AbstractConfig {
   // DuckLake retry configuration for handling PostgreSQL serialization conflicts
   public static final String DUCKLAKE_MAX_RETRY_COUNT = "ducklake.max_retry_count";
 
+  // Per-partition memory pressure threshold
+  public static final String MEMORY_PRESSURE_PER_PARTITION_BYTES =
+      "memory.pressure.per.partition.bytes";
+
   // Disk spill configuration for reducing memory pressure
   public static final String SPILL_ENABLED = "spill.enabled";
   public static final String SPILL_DIRECTORY = "spill.directory";
@@ -160,6 +164,15 @@ private static ConfigDef newConfigDef() {
             true,
             ConfigDef.Importance.MEDIUM,
             "Enable parallel flushing of partitions for higher throughput. Default: true")
+        .define(
+            MEMORY_PRESSURE_PER_PARTITION_BYTES,
+            ConfigDef.Type.LONG,
+            0L,
+            ConfigDef.Range.atLeast(0L),
+            ConfigDef.Importance.MEDIUM,
+            "Per-partition memory pressure threshold in bytes. When a partition's buffered data "
+                + "exceeds this value, it is flushed immediately. 0 disables memory pressure checks. "
+                + "Default: 0 (disabled)")
         .define(
             DUCKLAKE_MAX_RETRY_COUNT,
             ConfigDef.Type.INT,
@@ -259,6 +272,15 @@ public int getDuckDbThreads() {
     return threads;
   }
 
+  /**
+   * Returns the memory pressure threshold in bytes for each partition.
+   *
+   * @return memory pressure threshold in bytes per partition
+   */
+  public long getMemoryPressurePerPartitionBytes() {
+    return getLong(MEMORY_PRESSURE_PER_PARTITION_BYTES);
+  }
+
   /**
    * Returns whether parallel partition flushing is enabled.
    *
```

src/main/java/com/inyo/ducklake/connect/DucklakeSinkTask.java

Lines changed: 9 additions & 7 deletions
```diff
@@ -66,6 +66,7 @@ public class DucklakeSinkTask extends SinkTask {
   private ScheduledExecutorService flushScheduler;
   private ExecutorService partitionExecutor;
   private boolean parallelPartitionFlush;
+  private long memoryPressureBytes;
 
   // Spill configuration
   private boolean spillEnabled;
@@ -161,6 +162,7 @@ public void start(Map<String, String> map) {
     this.flushIntervalMs = config.getFlushIntervalMs();
     this.fileSizeBytes = config.getFileSizeBytes();
     this.parallelPartitionFlush = config.isParallelPartitionFlushEnabled();
+    this.memoryPressureBytes = config.getMemoryPressurePerPartitionBytes();
 
     // Initialize spill configuration
     this.spillEnabled = config.isSpillEnabled();
@@ -191,12 +193,13 @@ public void start(Map<String, String> map) {
     int threadCount = config.getDuckDbThreads();
     LOG.info(
         "Buffering config: flushSize={}, flushIntervalMs={}, fileSizeBytes={}, "
-            + "parallelPartitionFlush={}, duckdbThreads={}, spillEnabled={}",
+            + "parallelPartitionFlush={}, duckdbThreads={}, memoryPressureBytes={}, spillEnabled={}",
         flushSize,
         flushIntervalMs,
         fileSizeBytes,
         parallelPartitionFlush,
         threadCount,
+        memoryPressureBytes,
         spillEnabled);
 
     // Create executor for parallel partition processing
@@ -687,16 +690,15 @@ private FlushData checkAndExtractFlushData(TopicPartition partition) {
       return null;
     }
 
-    // Check for global memory pressure from Arrow allocator
-    long allocatedMemory = allocator.getAllocatedMemory();
-    boolean memoryPressure = allocatedMemory > fileSizeBytes * buffers.size();
+    // Check for per-partition memory pressure (0 = disabled)
+    boolean memoryPressure = memoryPressureBytes > 0 && buffer.estimatedBytes > memoryPressureBytes;
 
     if (memoryPressure) {
       LOG.warn(
-          "Memory pressure detected for partition {}: allocatorBytes={}, threshold={}",
+          "Memory pressure detected for partition {}: bufferBytes={}, threshold={}",
           partition,
-          allocatedMemory,
-          fileSizeBytes * buffers.size());
+          buffer.estimatedBytes,
+          memoryPressureBytes);
     }
 
     // Flush if normal thresholds exceeded OR if under memory pressure with data buffered
```
