Skip to content

Commit 3bbeb2c

Browse files
macvincentfacebook-github-bot
authored andcommitted
Use string_view size for string stream memory accounting (facebookincubator#432)
Summary: The new `NullableContentStringStreamData` chunker was using `sizeof(uint64_t)` (8 bytes) when calculating chunk sizes and memory usage, the batch reader consume string data using `std::string_view` (16 bytes). This mismatch causes memory regressions because chunk size calculations underestimate the actual memory footprint, leading to larger-than-expected chunks being created. This change uses `sizeof(std::string_view)` consistently across the chunking and stream data layers to protect readers that use string views from unexpected memory growth Reviewed By: helfman Differential Revision: D91016115
1 parent 0ecb7cc commit 3bbeb2c

File tree

5 files changed

+61
-20
lines changed

5 files changed

+61
-20
lines changed

dwio/nimble/velox/StreamChunker.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -659,7 +659,10 @@ class NullableContentStringStreamChunker final : public StreamChunker {
659659
if (nonNulls[idx]) {
660660
currentExtraMemory =
661661
stringLengths[lengthsOffset_ + chunkSize.dataElementCount];
662-
currentTotalSize += currentExtraMemory + sizeof(uint64_t);
662+
// NOTE: This is not a bug, we use sizeof(std::string_view) instead of
663+
// sizeof(uint64_t) to prevent reader regressions with the current
664+
// default writer settings.
665+
currentTotalSize += currentExtraMemory + sizeof(std::string_view);
663666
++currentDataCount;
664667
}
665668

dwio/nimble/velox/StreamData.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -356,13 +356,16 @@ class NullableContentStringStreamData final : public NullsStreamData {
356356
}
357357

358358
inline uint64_t memoryUsed() const override {
359+
// NOTE: This is not a bug, we multiply lengths_.size() by
360+
// sizeof(std::string_view) instead of sizeof(uint64_t) to prevent reader
361+
// regressions with the current default writer settings.
359362
return NullsStreamData::memoryUsed() + buffer_.size() +
360-
lengths_.size() * sizeof(uint64_t) +
363+
lengths_.size() * sizeof(std::string_view) +
361364
stringViews_.size() * sizeof(std::string_view);
362365
}
363366

364367
inline uint64_t bufferSize() const {
365-
return lengths_.size() * sizeof(uint64_t) + buffer_.size();
368+
return lengths_.size() * sizeof(std::string_view) + buffer_.size();
366369
}
367370

368371
inline StringBuffer mutableData() {

dwio/nimble/velox/tests/StreamChunkerTests.cpp

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ class StreamChunkerTestsBase : public ::testing::Test {
121121
std::vector<uint64_t>(lengths.begin(), lengths.end()),
122122
::testing::ElementsAreArray(expectedLength));
123123
expectedStreamDataSize = expectedBufferContent.size() +
124-
expectedLength.size() * sizeof(uint64_t);
124+
expectedLength.size() * sizeof(std::string_view);
125125
} else {
126126
NIMBLE_UNREACHABLE(
127127
"NullableContentStringStreamData can only be called on string data");
@@ -1139,16 +1139,16 @@ TEST_F(StreamChunkerTestsBase, nullableContentStringStreamChunking) {
11391139
populateStringData(data, testData);
11401140

11411141
// number of strings: 7
1142-
// size of string buffer length = 7 * 8 = 56 bytes
1142+
// size of string buffer length = 7 * 16 = 112 bytes
11431143
// size of nulls = 11 * 1 = 11 bytes
11441144
// total number of characters = 36 bytes
1145-
// total size of stream data = 56 + 11 + 36 = 103 bytes
1146-
ASSERT_EQ(stream.memoryUsed(), 103);
1145+
// total size of stream data = 112 + 11 + 36 = 159 bytes
1146+
ASSERT_EQ(stream.memoryUsed(), 159);
11471147

11481148
// Test 1: Not last chunk
11491149
{
1150-
const uint64_t maxChunkSize = 45;
1151-
const uint64_t minChunkSize = 40;
1150+
const uint64_t maxChunkSize = 72;
1151+
const uint64_t minChunkSize = 50;
11521152
auto chunker = getStreamChunker(
11531153
stream,
11541154
{
@@ -1220,7 +1220,8 @@ TEST_F(StreamChunkerTestsBase, nullableContentStringStreamChunking) {
12201220
populateData(smallNonNulls, smallNonNullsData);
12211221

12221222
// Exactly what is needed to fit just the first entry
1223-
const uint64_t minChunkSize = smallTestData.at(0).size() + sizeof(uint64_t);
1223+
const uint64_t minChunkSize =
1224+
smallTestData.at(0).size() + sizeof(std::string_view);
12241225
const uint64_t maxChunkSize = minChunkSize;
12251226
auto chunker = getStreamChunker(
12261227
stream,
@@ -1254,7 +1255,7 @@ TEST_F(StreamChunkerTestsBase, nullableContentStringStreamChunking) {
12541255

12551256
const uint64_t minChunkSize = 1;
12561257
// Size of "really really large string", string view, and null minus 1.
1257-
const uint64_t maxChunkSize = 24 + sizeof(uint64_t) + sizeof(bool);
1258+
const uint64_t maxChunkSize = 24 + sizeof(std::string_view) + sizeof(bool);
12581259
auto chunker = getStreamChunker(
12591260
stream,
12601261
{

dwio/nimble/velox/tests/StreamDataTest.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -548,7 +548,8 @@ TEST_F(NullableContentStringStreamDataTest, basicOperations) {
548548
EXPECT_EQ(buffer.size(), 10);
549549
EXPECT_EQ(
550550
streamData.memoryUsed(),
551-
nonNulls.size() + buffer.size() + lengths.size() * sizeof(uint64_t));
551+
nonNulls.size() + buffer.size() +
552+
lengths.size() * sizeof(std::string_view));
552553

553554
streamData.materialize();
554555
const auto* views =

dwio/nimble/velox/tests/VeloxWriterTest.cpp

Lines changed: 41 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
#include "dwio/nimble/velox/ChunkedStream.h"
3131
#include "dwio/nimble/velox/EncodingLayoutTree.h"
3232
#include "dwio/nimble/velox/FlushPolicy.h"
33+
#include "dwio/nimble/velox/SchemaReader.h"
3334
#include "dwio/nimble/velox/SchemaSerialization.h"
3435
#include "dwio/nimble/velox/StatsGenerated.h"
3536
#include "dwio/nimble/velox/VeloxReader.h"
@@ -392,10 +393,27 @@ ChunkSizeResults validateChunkSize(
392393
nimble::VeloxReader& reader,
393394
const uint64_t minStreamChunkRawSize,
394395
const uint64_t maxStreamChunkRawSize) {
395-
const double kMaxErrorRate = 0.2;
396+
constexpr int kRowCountOffset = 2;
397+
constexpr double kMaxErrorRate = 0.2;
396398
const auto& tablet = reader.tabletReader();
397399
auto& pool = reader.memoryPool();
398400

401+
std::unordered_set<uint32_t> stringStreamOffsets;
402+
nimble::SchemaReader::traverseSchema(
403+
reader.schema(),
404+
[&](uint32_t /*level*/,
405+
const nimble::Type& type,
406+
const nimble::SchemaReader::NodeInfo& /*info*/) {
407+
if (type.isScalar()) {
408+
auto scalarKind = type.asScalar().scalarDescriptor().scalarKind();
409+
if (scalarKind == nimble::ScalarKind::String ||
410+
scalarKind == nimble::ScalarKind::Binary) {
411+
stringStreamOffsets.insert(
412+
type.asScalar().scalarDescriptor().offset());
413+
}
414+
}
415+
});
416+
399417
const uint32_t stripeCount = tablet.stripeCount();
400418
uint32_t maxChunkCount = 0;
401419
uint32_t minChunkCount = std::numeric_limits<uint32_t>::max();
@@ -415,21 +433,35 @@ ChunkSizeResults validateChunkSize(
415433
nimble::InMemoryChunkedStream chunkedStream{
416434
pool, std::move(streamLoaders[streamId])};
417435
uint32_t currentStreamChunkCount = 0;
436+
const bool isStringStream = stringStreamOffsets.contains(streamId);
418437
while (chunkedStream.hasNext()) {
419438
++currentStreamChunkCount;
420439
const auto chunk = chunkedStream.nextChunk();
421440
const uint64_t chunkRawDataSize =
422441
nimble::test::TestUtils::getRawDataSize(pool, chunk);
423-
EXPECT_LE(chunkRawDataSize, maxStreamChunkRawSize)
424-
<< "Stream " << streamId << " has a chunk with size "
425-
<< chunkRawDataSize << " which is above max chunk size of "
426-
<< maxStreamChunkRawSize;
442+
const uint32_t rowCount =
443+
*reinterpret_cast<const uint32_t*>(chunk.data() + kRowCountOffset);
444+
const double stringError =
445+
isStringStream ? rowCount * sizeof(std::string_view) : 0;
446+
447+
// For string streams, a single row may exceed max chunk size if the
448+
// string itself is larger than the limit. This is acceptable since we
449+
// cannot split a single string value.
450+
if (!(isStringStream && rowCount == 1)) {
451+
const double maxError =
452+
kMaxErrorRate * maxStreamChunkRawSize + stringError;
453+
EXPECT_LE(chunkRawDataSize, maxStreamChunkRawSize + maxError)
454+
<< "Stream " << streamId << " has a chunk with size "
455+
<< chunkRawDataSize << " which is above max chunk size of "
456+
<< maxStreamChunkRawSize;
457+
}
427458

428459
// Validate min chunk size when not last chunk
429460
if (chunkedStream.hasNext() &&
430461
chunkRawDataSize < minStreamChunkRawSize) {
431-
uint64_t difference = minStreamChunkRawSize - chunkRawDataSize;
432-
EXPECT_LE(difference * 1.0 / minStreamChunkRawSize, kMaxErrorRate)
462+
const double minError =
463+
kMaxErrorRate * minStreamChunkRawSize + stringError;
464+
EXPECT_GE(chunkRawDataSize, minStreamChunkRawSize - minError)
433465
<< "Stream " << streamId << " has a non-last chunk with size "
434466
<< chunkRawDataSize << " which is below min chunk size of "
435467
<< minStreamChunkRawSize;
@@ -2248,7 +2280,8 @@ TEST_F(VeloxWriterTest, fuzzComplex) {
22482280

22492281
const auto iterations = 20;
22502282
// provide sufficient buffer between min and max chunk size thresholds
2251-
constexpr uint64_t chunkThresholdBuffer = sizeof(int64_t) + sizeof(bool);
2283+
constexpr uint64_t chunkThresholdBuffer =
2284+
sizeof(std::string_view) + sizeof(bool);
22522285
for (auto i = 0; i < iterations; ++i) {
22532286
writerOptions.minStreamChunkRawSize =
22542287
std::uniform_int_distribution<uint64_t>(10, 4096)(rng);

0 commit comments

Comments
 (0)