
Commit 9ec9ffa

colserde,rowcontainer: prohibit writing very large keys
We just saw a node crash in a test when we wrote a 2.5 GiB key to the temporary storage used by the row container. Such large keys aren't well supported by pebble and can lead to undefined behavior, so we add an explicit check that the key doesn't exceed 1.5 GiB. We also now drop scratch slices once they exceed 1 MiB in size (we already have memory accounting in place for them).

Similarly, the vectorized disk spilling could suffer from the same problem: in the arrow format offsets are int32, so if we were to serialize a vector of more than 2 GiB in size, we'd encounter undefined behavior (which we've seen a couple of times in sentry issues). This commit adds an explicit check there as well, returning an error if the serialized size exceeds max int32. Additionally, we now drop the reference to the large scratch slice that we keep across calls once it exceeds 32 MiB.

Note that I initially added a simple unit test that allocated a 3 GiB vector and ensured that an error is returned, but it hits an OOM in the EngFlow environment, and it doesn't seem worth upgrading it to the heavy pool, so I removed it.

In the test failure such a large value was produced via the `st_collect` geo builtin. Another example I can think of would be an array created via `array_agg`.

Release note: None
Parent: d164968

3 files changed: +63 -7 lines


pkg/col/colserde/BUILD.bazel

Lines changed: 1 addition & 0 deletions

@@ -23,6 +23,7 @@ go_library(
         "@com_github_apache_arrow_go_arrow//array",
         "@com_github_apache_arrow_go_arrow//memory",
         "@com_github_cockroachdb_errors//:errors",
+        "@com_github_dustin_go_humanize//:go-humanize",
         "@com_github_edsrzf_mmap_go//:mmap-go",
         "@com_github_google_flatbuffers//go",
     ],
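
The only Bazel change is the new go-humanize dependency, which the converter below uses to render the offending size in its error message. A minimal standalone sketch of that formatting call (not part of this commit; the printed string is my expectation of humanize.IBytes):

package main

import (
	"fmt"

	"github.com/dustin/go-humanize"
)

func main() {
	// humanize.IBytes renders a byte count in IEC units (KiB, MiB, GiB),
	// which is how an oversized serialized column gets reported.
	const size = 3 << 30 // e.g. a 3 GiB serialized column
	fmt.Println(humanize.IBytes(uint64(size))) // expected: "3.0 GiB"
}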

pkg/col/colserde/arrowbatchconverter.go

Lines changed: 24 additions & 1 deletion

@@ -9,6 +9,7 @@ import (
 	"context"
 	"encoding/binary"
 	"fmt"
+	"math"
 	"reflect"
 	"unsafe"

@@ -17,11 +18,14 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/col/coldata"
 	"github.com/cockroachdb/cockroach/pkg/col/typeconv"
 	"github.com/cockroachdb/cockroach/pkg/sql/memsize"
+	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
+	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror"
 	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
 	"github.com/cockroachdb/cockroach/pkg/sql/types"
 	"github.com/cockroachdb/cockroach/pkg/util/duration"
 	"github.com/cockroachdb/cockroach/pkg/util/mon"
 	"github.com/cockroachdb/errors"
+	"github.com/dustin/go-humanize"
 )

 // ConversionMode describes how ArrowBatchConverter will be utilized.
@@ -307,8 +311,27 @@ func (c *ArrowBatchConverter) BatchToArrow(
 			panic(fmt.Sprintf("unsupported type for conversion to arrow data %s", typ))
 		}

+		// If the serialized representation is larger than int32 range, then
+		// we'll fail to properly deserialize it (offsets will effectively
+		// contain corrupted information), so we return an early error
+		// instead.
+		if len(values) > math.MaxInt32 {
+			// Return an error with a pgcode so that it's not considered
+			// "internal" by the vectorized panic catcher.
+			return nil, pgerror.Newf(
+				pgcode.OutOfMemory, "serialized representation of %s column is too large: %s",
+				typ, humanize.IBytes(uint64(len(values))),
+			)
+		}
+
 		// Store the serialized slices as scratch space for the next call.
-		c.scratch.values[vecIdx] = values
+		// The only exception is when the values slice becomes too large, in
+		// which case we keep the scratch space unchanged (meaning that
+		// we'll keep the space from the previous call, if any).
+		const maxKeptSize = 32 << 20 /* 32 MiB */
+		if cap(values) <= maxKeptSize {
+			c.scratch.values[vecIdx] = values
+		}
 		c.scratch.offsets[vecIdx] = offsets
 	}
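
For context on the math.MaxInt32 guard: in the Arrow binary format, variable-length data is addressed by int32 offsets, so a serialized buffer longer than roughly 2 GiB cannot be represented and the offsets silently wrap. A minimal standalone sketch of that failure mode (hypothetical offsetFor helper, not from this commit):

package main

import (
	"fmt"
	"math"
)

// offsetFor mimics how an int32 Arrow-style offset would record the end
// position of a value inside the serialized buffer.
func offsetFor(end int64) int32 {
	return int32(end) // silently truncates once end exceeds math.MaxInt32
}

func main() {
	end := int64(math.MaxInt32) + 100 // a value ending past the 2 GiB boundary
	fmt.Println(offsetFor(end))       // negative offset, i.e. corrupted metadata
}

Returning a pgcode-tagged error up front, as the diff above does, avoids ever producing such offsets.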

pkg/sql/rowcontainer/disk_row_container.go

Lines changed: 38 additions & 6 deletions

@@ -186,6 +186,30 @@ func (d *DiskRowContainer) Len() int {
 	return int(d.rowID)
 }

+// Writing extremely large keys to pebble can lead to undefined behavior
+// (overflows and / or OOMs), so we'll prohibit keys larger than 1.5 GiB.
+const maxPebbleKeySize = 1536 << 20 /* 1.5 GiB */
+
+var maxPebbleKeySizeExceededErr = pgerror.Newf(pgcode.OutOfMemory, "temporary storage doesn't support keys larger than 1.5 GiB")
+
+// resetScratch prepares the scratch space for reuse. If the slice is too large
+// to keep, it's lost and the memory account is updated accordingly.
+func (d *DiskRowContainer) resetScratch(ctx context.Context) {
+	// Do not keep very large scratch space across rows (we're trying to
+	// minimize RAM usage after all since we've spilled to disk).
+	const maxKeptSize = 1 << 20 /* 1 MiB */
+	if cap(d.scratchKey) > maxKeptSize {
+		d.memAcc.Shrink(ctx, int64(cap(d.scratchKey)))
+		d.scratchKey = nil
+	}
+	if cap(d.scratchVal) > maxKeptSize {
+		d.memAcc.Shrink(ctx, int64(cap(d.scratchVal)))
+		d.scratchVal = nil
+	}
+	d.scratchKey = d.scratchKey[:0]
+	d.scratchVal = d.scratchVal[:0]
+}
+
 // AddRow is part of the SortableRowContainer interface.
 //
 // It is additionally used in de-duping mode by DiskBackedRowContainer when
@@ -200,7 +224,13 @@ func (d *DiskRowContainer) AddRow(ctx context.Context, row rowenc.EncDatumRow) e
 	if err := d.encodeRow(ctx, row); err != nil {
 		return err
 	}
+	defer d.resetScratch(ctx)
+	if len(d.scratchKey) > maxPebbleKeySize {
+		return maxPebbleKeySizeExceededErr
+	}
 	if err := d.diskAcc.Grow(ctx, int64(len(d.scratchKey)+len(d.scratchVal))); err != nil {
+		// TODO(yuzefovich): this error wrapping is redundant - err should be
+		// produced by the disk monitor.
 		return pgerror.Wrapf(err, pgcode.OutOfMemory,
 			"this query requires additional disk space")
 	}
@@ -222,8 +252,6 @@ func (d *DiskRowContainer) AddRow(ctx context.Context, row rowenc.EncDatumRow) e
 		}
 	}
 	d.totalEncodedRowBytes += uint64(len(d.scratchKey) + len(d.scratchVal))
-	d.scratchKey = d.scratchKey[:0]
-	d.scratchVal = d.scratchVal[:0]
 	d.rowID++
 	return nil
 }
@@ -235,10 +263,7 @@ func (d *DiskRowContainer) AddRowWithDeDup(
 	if err := d.encodeRow(ctx, row); err != nil {
 		return 0, err
 	}
-	defer func() {
-		d.scratchKey = d.scratchKey[:0]
-		d.scratchVal = d.scratchVal[:0]
-	}()
+	defer d.resetScratch(ctx)
 	// First use the cache to de-dup.
 	entry, ok := d.deDupCache[string(d.scratchKey)]
 	if ok {
@@ -269,7 +294,12 @@ func (d *DiskRowContainer) AddRowWithDeDup(
 		}
 		return int(idx), nil
 	}
+	if len(d.scratchKey) > maxPebbleKeySize {
+		return 0, maxPebbleKeySizeExceededErr
+	}
 	if err := d.diskAcc.Grow(ctx, int64(len(d.scratchKey)+len(d.scratchVal))); err != nil {
+		// TODO(yuzefovich): this error wrapping is redundant - err should be
+		// produced by the disk monitor.
 		return 0, pgerror.Wrapf(err, pgcode.OutOfMemory,
 			"this query requires additional disk space")
 	}
@@ -306,6 +336,8 @@ func (d *DiskRowContainer) testingFlushBuffer(ctx context.Context) {
 	d.clearDeDupCache(ctx)
 }

+// encodeRow encodes the given row into scratchKey and scratchVal fields. The
+// memory account is updated according to the new capacity of these slices.
 func (d *DiskRowContainer) encodeRow(ctx context.Context, row rowenc.EncDatumRow) (retErr error) {
 	if len(row) != len(d.types) {
 		log.Fatalf(ctx, "invalid row length %d, expected %d", len(row), len(d.types))
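
The resetScratch pattern above (reuse a scratch buffer across rows, but drop it once it grows past a threshold so a single huge row doesn't pin memory after the query has already spilled to disk) can be sketched in isolation as follows (hypothetical capReuse helper, no memory accounting, not from this commit):

package main

// capReuse returns the scratch slice to keep for the next row: the buffer is
// reused while it stays small, but dropped once its capacity exceeds maxKept
// so that one oversized row doesn't hold on to a large allocation.
func capReuse(buf []byte, maxKept int) []byte {
	if cap(buf) > maxKept {
		return nil // let the GC reclaim the oversized buffer
	}
	return buf[:0] // keep the allocation, reset the length
}

func main() {
	scratch := make([]byte, 0, 512)
	scratch = append(scratch, make([]byte, 4<<20)...) // encode a 4 MiB row
	// With the 1 MiB cap used by DiskRowContainer, the buffer is dropped here.
	scratch = capReuse(scratch, 1<<20)
	_ = scratch
}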
