Skip to content

Commit d96beea

Browse files
macvincentfacebook-github-bot
authored andcommitted
Use string_view size for string stream memory accounting
Summary: The `NullableContentStringStreamData` chunker was using `sizeof(uint64_t)` (8 bytes) when calculating chunk sizes and memory usage, the batch reader consume string data using `std::string_view` (16 bytes). This mismatch causes memory regressions because chunk size calculations underestimate the actual memory footprint, leading to larger-than-expected chunks being created. This change uses `sizeof(std::string_view)` consistently across the chunking and stream data layers to protect readers that use string views from unexpected memory growth Differential Revision: D91016115
1 parent 0ecb7cc commit d96beea

File tree

4 files changed

+13
-11
lines changed

4 files changed

+13
-11
lines changed

dwio/nimble/velox/StreamChunker.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -659,7 +659,7 @@ class NullableContentStringStreamChunker final : public StreamChunker {
659659
if (nonNulls[idx]) {
660660
currentExtraMemory =
661661
stringLengths[lengthsOffset_ + chunkSize.dataElementCount];
662-
currentTotalSize += currentExtraMemory + sizeof(uint64_t);
662+
currentTotalSize += currentExtraMemory + sizeof(std::string_view);
663663
++currentDataCount;
664664
}
665665

dwio/nimble/velox/StreamData.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -357,7 +357,7 @@ class NullableContentStringStreamData final : public NullsStreamData {
357357

358358
inline uint64_t memoryUsed() const override {
359359
return NullsStreamData::memoryUsed() + buffer_.size() +
360-
lengths_.size() * sizeof(uint64_t) +
360+
lengths_.size() * sizeof(std::string_view) +
361361
stringViews_.size() * sizeof(std::string_view);
362362
}
363363

dwio/nimble/velox/tests/StreamChunkerTests.cpp

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ class StreamChunkerTestsBase : public ::testing::Test {
121121
std::vector<uint64_t>(lengths.begin(), lengths.end()),
122122
::testing::ElementsAreArray(expectedLength));
123123
expectedStreamDataSize = expectedBufferContent.size() +
124-
expectedLength.size() * sizeof(uint64_t);
124+
expectedLength.size() * sizeof(std::string_view);
125125
} else {
126126
NIMBLE_UNREACHABLE(
127127
"NullableContentStringStreamData can only be called on string data");
@@ -1139,16 +1139,16 @@ TEST_F(StreamChunkerTestsBase, nullableContentStringStreamChunking) {
11391139
populateStringData(data, testData);
11401140

11411141
// number of strings: 7
1142-
// size of string buffer length = 7 * 8 = 56 bytes
1142+
// size of string buffer length = 7 * 16 = 112 bytes
11431143
// size of nulls = 11 * 1 = 11 bytes
11441144
// total number of characters = 36 bytes
1145-
// total size of stream data = 56 + 11 + 36 = 103 bytes
1146-
ASSERT_EQ(stream.memoryUsed(), 103);
1145+
// total size of stream data = 112 + 11 + 36 = 159 bytes
1146+
ASSERT_EQ(stream.memoryUsed(), 159);
11471147

11481148
// Test 1: Not last chunk
11491149
{
1150-
const uint64_t maxChunkSize = 45;
1151-
const uint64_t minChunkSize = 40;
1150+
const uint64_t maxChunkSize = 72;
1151+
const uint64_t minChunkSize = 50;
11521152
auto chunker = getStreamChunker(
11531153
stream,
11541154
{
@@ -1220,7 +1220,8 @@ TEST_F(StreamChunkerTestsBase, nullableContentStringStreamChunking) {
12201220
populateData(smallNonNulls, smallNonNullsData);
12211221

12221222
// Exactly what is needed to fit just the first entry
1223-
const uint64_t minChunkSize = smallTestData.at(0).size() + sizeof(uint64_t);
1223+
const uint64_t minChunkSize =
1224+
smallTestData.at(0).size() + sizeof(std::string_view);
12241225
const uint64_t maxChunkSize = minChunkSize;
12251226
auto chunker = getStreamChunker(
12261227
stream,
@@ -1254,7 +1255,7 @@ TEST_F(StreamChunkerTestsBase, nullableContentStringStreamChunking) {
12541255

12551256
const uint64_t minChunkSize = 1;
12561257
// Size of "really really large string", string view, and null minus 1.
1257-
const uint64_t maxChunkSize = 24 + sizeof(uint64_t) + sizeof(bool);
1258+
const uint64_t maxChunkSize = 24 + sizeof(std::string_view) + sizeof(bool);
12581259
auto chunker = getStreamChunker(
12591260
stream,
12601261
{

dwio/nimble/velox/tests/StreamDataTest.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -548,7 +548,8 @@ TEST_F(NullableContentStringStreamDataTest, basicOperations) {
548548
EXPECT_EQ(buffer.size(), 10);
549549
EXPECT_EQ(
550550
streamData.memoryUsed(),
551-
nonNulls.size() + buffer.size() + lengths.size() * sizeof(uint64_t));
551+
nonNulls.size() + buffer.size() +
552+
lengths.size() * sizeof(std::string_view));
552553

553554
streamData.materialize();
554555
const auto* views =

0 commit comments

Comments
 (0)