
Commit 9ec9ffa

colserde,rowcontainer: prohibit writing very large keys
We just saw a node crash in a test when we wrote a 2.5 GiB key to the temporary storage used by the row container. Such large keys aren't well supported by pebble and can lead to undefined behavior, so we add an explicit check that the key doesn't exceed 1.5 GiB. We also now drop scratch slices once they exceed 1 MiB in size (we already have memory accounting in place for them).

Similarly, the vectorized disk spilling could suffer from the same problem: in the arrow format offsets are int32, so if we were to serialize a vector of more than 2 GiB in size, we'd encounter undefined behavior (which we've seen a couple of times in sentry issues). This commit adds an explicit check there as well, returning an error if the serialized size exceeds max int32. Additionally, we now drop the reference to the large scratch slice that we keep across calls once it exceeds 32 MiB.

Note that I initially added a simple unit test that allocated a 3 GiB vector and ensured that an error is returned, but it hits an OOM in the EngFlow environment, and it doesn't seem worth upgrading it to the heavy pool, so I removed it.

In the test failure such a large value was produced via the `st_collect` geo builtin. Another example I can think of would be an array created via `array_agg`.

Release note: None
Parent: d164968

3 files changed: +63 -7 lines


pkg/col/colserde/BUILD.bazel

Lines changed: 1 addition & 0 deletions

@@ -23,6 +23,7 @@ go_library(
         "@com_github_apache_arrow_go_arrow//array",
         "@com_github_apache_arrow_go_arrow//memory",
         "@com_github_cockroachdb_errors//:errors",
+        "@com_github_dustin_go_humanize//:go-humanize",
         "@com_github_edsrzf_mmap_go//:mmap-go",
         "@com_github_google_flatbuffers//go",
     ],
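
The only Bazel change is the new go-humanize dependency, which the converter below uses to render the offending size in its error message. A minimal standalone sketch of that formatting call (not part of this commit; the printed string is my expectation of humanize.IBytes):

package main

import (
	"fmt"

	"github.com/dustin/go-humanize"
)

func main() {
	// humanize.IBytes renders a byte count in IEC units (KiB, MiB, GiB),
	// which is how an oversized serialized column gets reported.
	const size = 3 << 30 // e.g. a 3 GiB serialized column
	fmt.Println(humanize.IBytes(uint64(size))) // expected: "3.0 GiB"
}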

pkg/col/colserde/arrowbatchconverter.go

Lines changed: 24 additions & 1 deletion

@@ -9,6 +9,7 @@ import (
 	"context"
 	"encoding/binary"
 	"fmt"
+	"math"
 	"reflect"
 	"unsafe"

@@ -17,11 +18,14 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/col/coldata"
 	"github.com/cockroachdb/cockroach/pkg/col/typeconv"
 	"github.com/cockroachdb/cockroach/pkg/sql/memsize"
+	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
+	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror"
 	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
 	"github.com/cockroachdb/cockroach/pkg/sql/types"
 	"github.com/cockroachdb/cockroach/pkg/util/duration"
 	"github.com/cockroachdb/cockroach/pkg/util/mon"
 	"github.com/cockroachdb/errors"
+	"github.com/dustin/go-humanize"
 )

 // ConversionMode describes how ArrowBatchConverter will be utilized.
@@ -307,8 +311,27 @@ func (c *ArrowBatchConverter) BatchToArrow(
 			panic(fmt.Sprintf("unsupported type for conversion to arrow data %s", typ))
 		}

+		// If the serialized representation is larger than int32 range, then
+		// we'll fail to properly deserialize it (offsets will effectively
+		// contain corrupted information), so we return an early error
+		// instead.
+		if len(values) > math.MaxInt32 {
+			// Return an error with a pgcode so that it's not considered
+			// "internal" by the vectorized panic catcher.
+			return nil, pgerror.Newf(
+				pgcode.OutOfMemory, "serialized representation of %s column is too large: %s",
+				typ, humanize.IBytes(uint64(len(values))),
+			)
+		}
+
 		// Store the serialized slices as scratch space for the next call.
-		c.scratch.values[vecIdx] = values
+		// The only exception is when the values slice becomes too large, in
+		// which case we keep the scratch space unchanged (meaning that
+		// we'll keep the space from the previous call, if any).
+		const maxKeptSize = 32 << 20 /* 32 MiB */
+		if cap(values) <= maxKeptSize {
+			c.scratch.values[vecIdx] = values
+		}
 		c.scratch.offsets[vecIdx] = offsets
 	}
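
For context on the math.MaxInt32 guard: in the Arrow binary format, variable-length data is addressed by int32 offsets, so a serialized buffer longer than roughly 2 GiB cannot be represented and the offsets silently wrap. A minimal standalone sketch of that failure mode (hypothetical offsetFor helper, not from this commit):

package main

import (
	"fmt"
	"math"
)

// offsetFor mimics how an int32 Arrow-style offset would record the end
// position of a value inside the serialized buffer.
func offsetFor(end int64) int32 {
	return int32(end) // silently truncates once end exceeds math.MaxInt32
}

func main() {
	end := int64(math.MaxInt32) + 100 // a value ending past the 2 GiB boundary
	fmt.Println(offsetFor(end))       // negative offset, i.e. corrupted metadata
}

Returning a pgcode-tagged error up front, as the diff above does, avoids ever producing such offsets.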

pkg/sql/rowcontainer/disk_row_container.go

Lines changed: 38 additions & 6 deletions

@@ -186,6 +186,30 @@ func (d *DiskRowContainer) Len() int {
 	return int(d.rowID)
 }

+// Writing extremely large keys to pebble can lead to undefined behavior
+// (overflows and / or OOMs), so we'll prohibit keys larger than 1.5 GiB.
+const maxPebbleKeySize = 1536 << 20 /* 1.5 GiB */
+
+var maxPebbleKeySizeExceededErr = pgerror.Newf(pgcode.OutOfMemory, "temporary storage doesn't support keys larger than 1.5 GiB")
+
+// resetScratch prepares the scratch space for reuse. If the slice is too large
+// to keep, it's lost and the memory account is updated accordingly.
+func (d *DiskRowContainer) resetScratch(ctx context.Context) {
+	// Do not keep very large scratch space across rows (we're trying to
+	// minimize RAM usage after all since we've spilled to disk).
+	const maxKeptSize = 1 << 20 /* 1 MiB */
+	if cap(d.scratchKey) > maxKeptSize {
+		d.memAcc.Shrink(ctx, int64(cap(d.scratchKey)))
+		d.scratchKey = nil
+	}
+	if cap(d.scratchVal) > maxKeptSize {
+		d.memAcc.Shrink(ctx, int64(cap(d.scratchVal)))
+		d.scratchVal = nil
+	}
+	d.scratchKey = d.scratchKey[:0]
+	d.scratchVal = d.scratchVal[:0]
+}
+
 // AddRow is part of the SortableRowContainer interface.
 //
 // It is additionally used in de-duping mode by DiskBackedRowContainer when
@@ -200,7 +224,13 @@ func (d *DiskRowContainer) AddRow(ctx context.Context, row rowenc.EncDatumRow) e
 	if err := d.encodeRow(ctx, row); err != nil {
 		return err
 	}
+	defer d.resetScratch(ctx)
+	if len(d.scratchKey) > maxPebbleKeySize {
+		return maxPebbleKeySizeExceededErr
+	}
 	if err := d.diskAcc.Grow(ctx, int64(len(d.scratchKey)+len(d.scratchVal))); err != nil {
+		// TODO(yuzefovich): this error wrapping is redundant - err should be
+		// produced by the disk monitor.
 		return pgerror.Wrapf(err, pgcode.OutOfMemory,
 			"this query requires additional disk space")
 	}
@@ -222,8 +252,6 @@ func (d *DiskRowContainer) AddRow(ctx context.Context, row rowenc.EncDatumRow) e
 		}
 	}
 	d.totalEncodedRowBytes += uint64(len(d.scratchKey) + len(d.scratchVal))
-	d.scratchKey = d.scratchKey[:0]
-	d.scratchVal = d.scratchVal[:0]
 	d.rowID++
 	return nil
 }
@@ -235,10 +263,7 @@ func (d *DiskRowContainer) AddRowWithDeDup(
 	if err := d.encodeRow(ctx, row); err != nil {
 		return 0, err
 	}
-	defer func() {
-		d.scratchKey = d.scratchKey[:0]
-		d.scratchVal = d.scratchVal[:0]
-	}()
+	defer d.resetScratch(ctx)
 	// First use the cache to de-dup.
 	entry, ok := d.deDupCache[string(d.scratchKey)]
 	if ok {
@@ -269,7 +294,12 @@ func (d *DiskRowContainer) AddRowWithDeDup(
 		}
 		return int(idx), nil
 	}
+	if len(d.scratchKey) > maxPebbleKeySize {
+		return 0, maxPebbleKeySizeExceededErr
+	}
 	if err := d.diskAcc.Grow(ctx, int64(len(d.scratchKey)+len(d.scratchVal))); err != nil {
+		// TODO(yuzefovich): this error wrapping is redundant - err should be
+		// produced by the disk monitor.
 		return 0, pgerror.Wrapf(err, pgcode.OutOfMemory,
 			"this query requires additional disk space")
 	}
@@ -306,6 +336,8 @@ func (d *DiskRowContainer) testingFlushBuffer(ctx context.Context) {
 	d.clearDeDupCache(ctx)
 }

+// encodeRow encodes the given row into scratchKey and scratchVal fields. The
+// memory account is updated according to the new capacity of these slices.
 func (d *DiskRowContainer) encodeRow(ctx context.Context, row rowenc.EncDatumRow) (retErr error) {
 	if len(row) != len(d.types) {
 		log.Fatalf(ctx, "invalid row length %d, expected %d", len(row), len(d.types))
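
The resetScratch pattern above (reuse a scratch buffer across rows, but drop it once it grows past a threshold so a single huge row doesn't pin memory after the query has already spilled to disk) can be sketched in isolation as follows (hypothetical capReuse helper, no memory accounting, not from this commit):

package main

// capReuse returns the scratch slice to keep for the next row: the buffer is
// reused while it stays small, but dropped once its capacity exceeds maxKept
// so that one oversized row doesn't hold on to a large allocation.
func capReuse(buf []byte, maxKept int) []byte {
	if cap(buf) > maxKept {
		return nil // let the GC reclaim the oversized buffer
	}
	return buf[:0] // keep the allocation, reset the length
}

func main() {
	scratch := make([]byte, 0, 512)
	scratch = append(scratch, make([]byte, 4<<20)...) // encode a 4 MiB row
	// With the 1 MiB cap used by DiskRowContainer, the buffer is dropped here.
	scratch = capReuse(scratch, 1<<20)
	_ = scratch
}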
