Skip to content

Commit 21debc3

Browse files
committed
GH-47995: [C++][Parquet] Fix empty string min/max statistics being lost during merge
Prior to this change, CleanStatistic() for ByteArray rejected statistics where ptr == nullptr. However, empty strings can have ptr == nullptr with len == 0, causing valid statistics to be discarded when the minimum value is an empty string. The fix introduces a sentinel pointer (kNoValueSentinel) distinct from nullptr to mark "no value" in ByteArray statistics. This allows CleanStatistic to distinguish between "no min/max computed" (sentinel) and "min/max is empty string" (nullptr with len=0). FLBA is unchanged since it has fixed length and no "empty" concept.
1 parent a9f0a4a commit 21debc3

File tree

2 files changed

+260
-2
lines changed

2 files changed

+260
-2
lines changed

cpp/src/parquet/statistics.cc

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,12 @@ namespace {
5555
constexpr int value_length(int value_length, const ByteArray& value) { return value.len; }
5656
constexpr int value_length(int type_length, const FLBA& value) { return type_length; }
5757

58+
// Sentinel pointer to mark "no value" for ByteArray statistics.
59+
// Distinct from nullptr, which is valid for empty strings.
60+
// See: https://github.com/apache/arrow/issues/47995
61+
inline constexpr uint8_t kNoValueSentinelBytes[1] = {0};
62+
inline constexpr const uint8_t* kNoValueSentinel = kNoValueSentinelBytes;
63+
5864
// Static "constants" for normalizing float16 min/max values. These need to be expressed
5965
// as pointers because `Float16LogicalType` represents an FLBA.
6066
struct Float16Constants {
@@ -290,7 +296,26 @@ struct BinaryLikeCompareHelperBase {
290296

291297
template <bool is_signed>
292298
struct CompareHelper<ByteArrayType, is_signed>
293-
: public BinaryLikeCompareHelperBase<ByteArrayType, is_signed> {};
299+
: public BinaryLikeCompareHelperBase<ByteArrayType, is_signed> {
300+
using Base = BinaryLikeCompareHelperBase<ByteArrayType, is_signed>;
301+
using T = ByteArray;
302+
303+
// Use kNoValueSentinel instead of nullptr to distinguish "no value" from empty string.
304+
static T DefaultMin() { return T{0, kNoValueSentinel}; }
305+
static T DefaultMax() { return T{0, kNoValueSentinel}; }
306+
307+
static T Min(int type_length, const T& a, const T& b) {
308+
if (a.ptr == kNoValueSentinel) return b;
309+
if (b.ptr == kNoValueSentinel) return a;
310+
return Base::Compare(type_length, a, b) ? a : b;
311+
}
312+
313+
static T Max(int type_length, const T& a, const T& b) {
314+
if (a.ptr == kNoValueSentinel) return b;
315+
if (b.ptr == kNoValueSentinel) return a;
316+
return Base::Compare(type_length, a, b) ? b : a;
317+
}
318+
};
294319

295320
template <bool is_signed>
296321
struct CompareHelper<FLBAType, is_signed>
@@ -412,7 +437,9 @@ optional<std::pair<FLBA, FLBA>> CleanStatistic(std::pair<FLBA, FLBA> min_max,
412437

413438
optional<std::pair<ByteArray, ByteArray>> CleanStatistic(
414439
std::pair<ByteArray, ByteArray> min_max, LogicalType::Type::type) {
415-
if (min_max.first.ptr == nullptr || min_max.second.ptr == nullptr) {
440+
// Check for kNoValueSentinel (not nullptr) because nullptr is valid for empty strings.
441+
// See: https://github.com/apache/arrow/issues/47995
442+
if (min_max.first.ptr == kNoValueSentinel || min_max.second.ptr == kNoValueSentinel) {
416443
return ::std::nullopt;
417444
}
418445
return min_max;

cpp/src/parquet/statistics_test.cc

Lines changed: 231 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1804,5 +1804,236 @@ TEST(TestEncodedStatistics, ApplyStatSizeLimits) {
18041804
EXPECT_FALSE(statistics->HasMinMax());
18051805
}
18061806

1807+
// GH-47995: Empty string minimum should be preserved after merge
1808+
TEST(TestByteArrayStatistics, MergeWithEmptyStringMin) {
1809+
NodePtr node =
1810+
PrimitiveNode::Make("StringColumn", Repetition::OPTIONAL, Type::BYTE_ARRAY);
1811+
ColumnDescriptor descr(node, 1, 1);
1812+
1813+
// Create statistics with empty string as min, "zzz" as max
1814+
auto stats1 = MakeStatistics<ByteArrayType>(&descr);
1815+
std::vector<ByteArray> values1 = {ByteArray(""), ByteArray("abc"), ByteArray("zzz")};
1816+
stats1->Update(values1.data(), values1.size(), 0);
1817+
ASSERT_TRUE(stats1->HasMinMax());
1818+
EXPECT_EQ(stats1->min(), ByteArray(""));
1819+
EXPECT_EQ(stats1->max(), ByteArray("zzz"));
1820+
1821+
// Create statistics with "aaa" as min, "bbb" as max
1822+
auto stats2 = MakeStatistics<ByteArrayType>(&descr);
1823+
std::vector<ByteArray> values2 = {ByteArray("aaa"), ByteArray("bbb")};
1824+
stats2->Update(values2.data(), values2.size(), 0);
1825+
ASSERT_TRUE(stats2->HasMinMax());
1826+
EXPECT_EQ(stats2->min(), ByteArray("aaa"));
1827+
EXPECT_EQ(stats2->max(), ByteArray("bbb"));
1828+
1829+
// Merge stats2 into stats1: min should still be "", max should be "zzz"
1830+
stats1->Merge(*stats2);
1831+
ASSERT_TRUE(stats1->HasMinMax());
1832+
EXPECT_EQ(stats1->min(), ByteArray(""));
1833+
EXPECT_EQ(stats1->max(), ByteArray("zzz"));
1834+
1835+
// Create fresh statistics and merge in opposite order
1836+
auto stats3 = MakeStatistics<ByteArrayType>(&descr);
1837+
std::vector<ByteArray> values3 = {ByteArray(""), ByteArray("abc"), ByteArray("zzz")};
1838+
stats3->Update(values3.data(), values3.size(), 0);
1839+
1840+
auto stats4 = MakeStatistics<ByteArrayType>(&descr);
1841+
std::vector<ByteArray> values4 = {ByteArray("aaa"), ByteArray("bbb")};
1842+
stats4->Update(values4.data(), values4.size(), 0);
1843+
1844+
// Merge stats3 into stats4: min should become "", max should become "zzz"
1845+
stats4->Merge(*stats3);
1846+
ASSERT_TRUE(stats4->HasMinMax());
1847+
EXPECT_EQ(stats4->min(), ByteArray(""));
1848+
EXPECT_EQ(stats4->max(), ByteArray("zzz"));
1849+
}
1850+
1851+
// GH-47995: Comprehensive tests for ByteArray statistics merge with empty strings.
1852+
// Tests all combinations of (no min/max) vs (has min/max) with min="" or min="aaa".
1853+
TEST(TestByteArrayStatistics, MergeEmptyStringComprehensive) {
1854+
NodePtr node =
1855+
PrimitiveNode::Make("StringColumn", Repetition::OPTIONAL, Type::BYTE_ARRAY);
1856+
ColumnDescriptor descr(node, 1, 1);
1857+
1858+
auto make_empty_stats = [&]() { return MakeStatistics<ByteArrayType>(&descr); };
1859+
1860+
auto make_stats_with_empty_min = [&]() {
1861+
auto stats = MakeStatistics<ByteArrayType>(&descr);
1862+
std::vector<ByteArray> values = {ByteArray(""), ByteArray("zzz")};
1863+
stats->Update(values.data(), values.size(), 0);
1864+
return stats;
1865+
};
1866+
1867+
auto make_stats_with_nonempty_min = [&]() {
1868+
auto stats = MakeStatistics<ByteArrayType>(&descr);
1869+
std::vector<ByteArray> values = {ByteArray("aaa"), ByteArray("yyy")};
1870+
stats->Update(values.data(), values.size(), 0);
1871+
return stats;
1872+
};
1873+
1874+
// 1. no min/max, merge (no min/max) -> no min/max
1875+
{
1876+
auto stats = make_empty_stats();
1877+
ASSERT_FALSE(stats->HasMinMax());
1878+
stats->Merge(*make_empty_stats());
1879+
ASSERT_FALSE(stats->HasMinMax()) << "empty + empty = empty";
1880+
}
1881+
1882+
// 2. no min/max, merge (min="", max="zzz") -> min="", max="zzz"
1883+
{
1884+
auto stats = make_empty_stats();
1885+
ASSERT_FALSE(stats->HasMinMax());
1886+
stats->Merge(*make_stats_with_empty_min());
1887+
ASSERT_TRUE(stats->HasMinMax()) << "empty + has_minmax = has_minmax";
1888+
EXPECT_EQ(stats->min(), ByteArray(""));
1889+
EXPECT_EQ(stats->max(), ByteArray("zzz"));
1890+
}
1891+
1892+
// 3. no min/max, merge (min="aaa", max="yyy") -> min="aaa", max="yyy"
1893+
{
1894+
auto stats = make_empty_stats();
1895+
ASSERT_FALSE(stats->HasMinMax());
1896+
stats->Merge(*make_stats_with_nonempty_min());
1897+
ASSERT_TRUE(stats->HasMinMax()) << "empty + has_minmax = has_minmax";
1898+
EXPECT_EQ(stats->min(), ByteArray("aaa"));
1899+
EXPECT_EQ(stats->max(), ByteArray("yyy"));
1900+
}
1901+
1902+
// 3b. no min/max, merge (min="", max="") -> min="", max=""
1903+
{
1904+
auto stats = make_empty_stats();
1905+
ASSERT_FALSE(stats->HasMinMax());
1906+
auto other = MakeStatistics<ByteArrayType>(&descr);
1907+
std::vector<ByteArray> values = {ByteArray("")};
1908+
other->Update(values.data(), values.size(), 0);
1909+
ASSERT_TRUE(other->HasMinMax());
1910+
EXPECT_EQ(other->min(), ByteArray(""));
1911+
EXPECT_EQ(other->max(), ByteArray(""));
1912+
stats->Merge(*other);
1913+
ASSERT_TRUE(stats->HasMinMax()) << "empty + (min='', max='') = has_minmax";
1914+
EXPECT_EQ(stats->min(), ByteArray(""));
1915+
EXPECT_EQ(stats->max(), ByteArray(""));
1916+
}
1917+
1918+
// 3c. (min="", max=""), merge (no min/max) -> min="", max=""
1919+
{
1920+
auto stats = MakeStatistics<ByteArrayType>(&descr);
1921+
std::vector<ByteArray> values = {ByteArray("")};
1922+
stats->Update(values.data(), values.size(), 0);
1923+
ASSERT_TRUE(stats->HasMinMax());
1924+
EXPECT_EQ(stats->min(), ByteArray(""));
1925+
EXPECT_EQ(stats->max(), ByteArray(""));
1926+
stats->Merge(*make_empty_stats());
1927+
ASSERT_TRUE(stats->HasMinMax()) << "(min='', max='') + empty = has_minmax";
1928+
EXPECT_EQ(stats->min(), ByteArray(""));
1929+
EXPECT_EQ(stats->max(), ByteArray(""));
1930+
}
1931+
1932+
// 4. (min="", max="zzz"), merge (no min/max) -> min="", max="zzz"
1933+
{
1934+
auto stats = make_stats_with_empty_min();
1935+
ASSERT_TRUE(stats->HasMinMax());
1936+
stats->Merge(*make_empty_stats());
1937+
ASSERT_TRUE(stats->HasMinMax()) << "has_minmax + empty = has_minmax";
1938+
EXPECT_EQ(stats->min(), ByteArray(""));
1939+
EXPECT_EQ(stats->max(), ByteArray("zzz"));
1940+
}
1941+
1942+
// 5. (min="", max="zzz"), merge (min="", max="yyy") -> min="", max="zzz"
1943+
{
1944+
auto stats = make_stats_with_empty_min();
1945+
auto other = make_stats_with_empty_min();
1946+
// Modify other to have different max for clarity
1947+
stats->Merge(*other);
1948+
ASSERT_TRUE(stats->HasMinMax());
1949+
EXPECT_EQ(stats->min(), ByteArray(""));
1950+
EXPECT_EQ(stats->max(), ByteArray("zzz"));
1951+
}
1952+
1953+
// 6. (min="", max="zzz"), merge (min="aaa", max="yyy") -> min="", max="zzz"
1954+
{
1955+
auto stats = make_stats_with_empty_min();
1956+
stats->Merge(*make_stats_with_nonempty_min());
1957+
ASSERT_TRUE(stats->HasMinMax());
1958+
EXPECT_EQ(stats->min(), ByteArray("")) << "empty string < 'aaa'";
1959+
EXPECT_EQ(stats->max(), ByteArray("zzz")) << "'zzz' > 'yyy'";
1960+
}
1961+
1962+
// 7. (min="aaa", max="yyy"), merge (no min/max) -> min="aaa", max="yyy"
1963+
{
1964+
auto stats = make_stats_with_nonempty_min();
1965+
stats->Merge(*make_empty_stats());
1966+
ASSERT_TRUE(stats->HasMinMax());
1967+
EXPECT_EQ(stats->min(), ByteArray("aaa"));
1968+
EXPECT_EQ(stats->max(), ByteArray("yyy"));
1969+
}
1970+
1971+
// 8. (min="aaa", max="yyy"), merge (min="", max="zzz") -> min="", max="zzz"
1972+
{
1973+
auto stats = make_stats_with_nonempty_min();
1974+
stats->Merge(*make_stats_with_empty_min());
1975+
ASSERT_TRUE(stats->HasMinMax());
1976+
EXPECT_EQ(stats->min(), ByteArray("")) << "empty string < 'aaa'";
1977+
EXPECT_EQ(stats->max(), ByteArray("zzz")) << "'zzz' > 'yyy'";
1978+
}
1979+
1980+
// 9. (min="aaa", max="yyy"), merge (min="bbb", max="xxx") -> min="aaa", max="yyy"
1981+
{
1982+
auto stats = make_stats_with_nonempty_min();
1983+
auto other = MakeStatistics<ByteArrayType>(&descr);
1984+
std::vector<ByteArray> values = {ByteArray("bbb"), ByteArray("xxx")};
1985+
other->Update(values.data(), values.size(), 0);
1986+
stats->Merge(*other);
1987+
ASSERT_TRUE(stats->HasMinMax());
1988+
EXPECT_EQ(stats->min(), ByteArray("aaa"));
1989+
EXPECT_EQ(stats->max(), ByteArray("yyy"));
1990+
}
1991+
}
1992+
1993+
// GH-47995: Statistics from encoded empty string min should be preserved after merge.
1994+
// This test reproduces the bug where empty string min (encoded as "") results in
1995+
// ByteArray with ptr=nullptr after internal Copy(), which was incorrectly rejected
1996+
// by CleanStatistic().
1997+
TEST(TestByteArrayStatistics, MergeEncodedEmptyStringMin) {
1998+
NodePtr node =
1999+
PrimitiveNode::Make("StringColumn", Repetition::OPTIONAL, Type::BYTE_ARRAY);
2000+
ColumnDescriptor descr(node, 1, 1);
2001+
2002+
// Create statistics from encoded values (simulates reading from Parquet metadata)
2003+
// Empty string "" as min, "zzz" as max
2004+
std::string encoded_min = ""; // empty string
2005+
std::string encoded_max = "zzz";
2006+
auto stats1 = Statistics::Make(&descr, encoded_min, encoded_max,
2007+
/*num_values=*/100, /*null_count=*/0,
2008+
/*distinct_count=*/0, /*has_min_max=*/true,
2009+
/*has_null_count=*/true, /*has_distinct_count=*/false);
2010+
auto typed_stats1 = std::dynamic_pointer_cast<TypedStatistics<ByteArrayType>>(stats1);
2011+
ASSERT_TRUE(typed_stats1->HasMinMax());
2012+
EXPECT_EQ(typed_stats1->min(), ByteArray(""));
2013+
EXPECT_EQ(typed_stats1->max(), ByteArray("zzz"));
2014+
2015+
// Create second statistics with "aaa" as min, "bbb" as max
2016+
std::string encoded_min2 = "aaa";
2017+
std::string encoded_max2 = "bbb";
2018+
auto stats2 = Statistics::Make(&descr, encoded_min2, encoded_max2,
2019+
/*num_values=*/100, /*null_count=*/0,
2020+
/*distinct_count=*/0, /*has_min_max=*/true,
2021+
/*has_null_count=*/true, /*has_distinct_count=*/false);
2022+
auto typed_stats2 = std::dynamic_pointer_cast<TypedStatistics<ByteArrayType>>(stats2);
2023+
ASSERT_TRUE(typed_stats2->HasMinMax());
2024+
2025+
// Create a fresh stats object and merge both
2026+
auto merged = MakeStatistics<ByteArrayType>(&descr);
2027+
merged->Merge(*typed_stats1);
2028+
ASSERT_TRUE(merged->HasMinMax()) << "Min/max should be preserved after first merge";
2029+
EXPECT_EQ(merged->min(), ByteArray(""));
2030+
EXPECT_EQ(merged->max(), ByteArray("zzz"));
2031+
2032+
merged->Merge(*typed_stats2);
2033+
ASSERT_TRUE(merged->HasMinMax()) << "Min/max should be preserved after second merge";
2034+
EXPECT_EQ(merged->min(), ByteArray("")); // empty string should still be min
2035+
EXPECT_EQ(merged->max(), ByteArray("zzz"));
2036+
}
2037+
18072038
} // namespace test
18082039
} // namespace parquet

0 commit comments

Comments
 (0)