@@ -1804,5 +1804,236 @@ TEST(TestEncodedStatistics, ApplyStatSizeLimits) {
18041804 EXPECT_FALSE (statistics->HasMinMax ());
18051805}
18061806
1807+ // GH-47995: Empty string minimum should be preserved after merge
1808+ TEST (TestByteArrayStatistics, MergeWithEmptyStringMin) {
1809+ NodePtr node =
1810+ PrimitiveNode::Make (" StringColumn" , Repetition::OPTIONAL, Type::BYTE_ARRAY);
1811+ ColumnDescriptor descr (node, 1 , 1 );
1812+
1813+ // Create statistics with empty string as min, "zzz" as max
1814+ auto stats1 = MakeStatistics<ByteArrayType>(&descr);
1815+ std::vector<ByteArray> values1 = {ByteArray (" " ), ByteArray (" abc" ), ByteArray (" zzz" )};
1816+ stats1->Update (values1.data (), values1.size (), 0 );
1817+ ASSERT_TRUE (stats1->HasMinMax ());
1818+ EXPECT_EQ (stats1->min (), ByteArray (" " ));
1819+ EXPECT_EQ (stats1->max (), ByteArray (" zzz" ));
1820+
1821+ // Create statistics with "aaa" as min, "bbb" as max
1822+ auto stats2 = MakeStatistics<ByteArrayType>(&descr);
1823+ std::vector<ByteArray> values2 = {ByteArray (" aaa" ), ByteArray (" bbb" )};
1824+ stats2->Update (values2.data (), values2.size (), 0 );
1825+ ASSERT_TRUE (stats2->HasMinMax ());
1826+ EXPECT_EQ (stats2->min (), ByteArray (" aaa" ));
1827+ EXPECT_EQ (stats2->max (), ByteArray (" bbb" ));
1828+
1829+ // Merge stats2 into stats1: min should still be "", max should be "zzz"
1830+ stats1->Merge (*stats2);
1831+ ASSERT_TRUE (stats1->HasMinMax ());
1832+ EXPECT_EQ (stats1->min (), ByteArray (" " ));
1833+ EXPECT_EQ (stats1->max (), ByteArray (" zzz" ));
1834+
1835+ // Create fresh statistics and merge in opposite order
1836+ auto stats3 = MakeStatistics<ByteArrayType>(&descr);
1837+ std::vector<ByteArray> values3 = {ByteArray (" " ), ByteArray (" abc" ), ByteArray (" zzz" )};
1838+ stats3->Update (values3.data (), values3.size (), 0 );
1839+
1840+ auto stats4 = MakeStatistics<ByteArrayType>(&descr);
1841+ std::vector<ByteArray> values4 = {ByteArray (" aaa" ), ByteArray (" bbb" )};
1842+ stats4->Update (values4.data (), values4.size (), 0 );
1843+
1844+ // Merge stats3 into stats4: min should become "", max should become "zzz"
1845+ stats4->Merge (*stats3);
1846+ ASSERT_TRUE (stats4->HasMinMax ());
1847+ EXPECT_EQ (stats4->min (), ByteArray (" " ));
1848+ EXPECT_EQ (stats4->max (), ByteArray (" zzz" ));
1849+ }
1850+
1851+ // GH-47995: Comprehensive tests for ByteArray statistics merge with empty strings.
1852+ // Tests all combinations of (no min/max) vs (has min/max) with min="" or min="aaa".
1853+ TEST (TestByteArrayStatistics, MergeEmptyStringComprehensive) {
1854+ NodePtr node =
1855+ PrimitiveNode::Make (" StringColumn" , Repetition::OPTIONAL, Type::BYTE_ARRAY);
1856+ ColumnDescriptor descr (node, 1 , 1 );
1857+
1858+ auto make_empty_stats = [&]() { return MakeStatistics<ByteArrayType>(&descr); };
1859+
1860+ auto make_stats_with_empty_min = [&]() {
1861+ auto stats = MakeStatistics<ByteArrayType>(&descr);
1862+ std::vector<ByteArray> values = {ByteArray (" " ), ByteArray (" zzz" )};
1863+ stats->Update (values.data (), values.size (), 0 );
1864+ return stats;
1865+ };
1866+
1867+ auto make_stats_with_nonempty_min = [&]() {
1868+ auto stats = MakeStatistics<ByteArrayType>(&descr);
1869+ std::vector<ByteArray> values = {ByteArray (" aaa" ), ByteArray (" yyy" )};
1870+ stats->Update (values.data (), values.size (), 0 );
1871+ return stats;
1872+ };
1873+
1874+ // 1. no min/max, merge (no min/max) -> no min/max
1875+ {
1876+ auto stats = make_empty_stats ();
1877+ ASSERT_FALSE (stats->HasMinMax ());
1878+ stats->Merge (*make_empty_stats ());
1879+ ASSERT_FALSE (stats->HasMinMax ()) << " empty + empty = empty" ;
1880+ }
1881+
1882+ // 2. no min/max, merge (min="", max="zzz") -> min="", max="zzz"
1883+ {
1884+ auto stats = make_empty_stats ();
1885+ ASSERT_FALSE (stats->HasMinMax ());
1886+ stats->Merge (*make_stats_with_empty_min ());
1887+ ASSERT_TRUE (stats->HasMinMax ()) << " empty + has_minmax = has_minmax" ;
1888+ EXPECT_EQ (stats->min (), ByteArray (" " ));
1889+ EXPECT_EQ (stats->max (), ByteArray (" zzz" ));
1890+ }
1891+
1892+ // 3. no min/max, merge (min="aaa", max="yyy") -> min="aaa", max="yyy"
1893+ {
1894+ auto stats = make_empty_stats ();
1895+ ASSERT_FALSE (stats->HasMinMax ());
1896+ stats->Merge (*make_stats_with_nonempty_min ());
1897+ ASSERT_TRUE (stats->HasMinMax ()) << " empty + has_minmax = has_minmax" ;
1898+ EXPECT_EQ (stats->min (), ByteArray (" aaa" ));
1899+ EXPECT_EQ (stats->max (), ByteArray (" yyy" ));
1900+ }
1901+
1902+ // 3b. no min/max, merge (min="", max="") -> min="", max=""
1903+ {
1904+ auto stats = make_empty_stats ();
1905+ ASSERT_FALSE (stats->HasMinMax ());
1906+ auto other = MakeStatistics<ByteArrayType>(&descr);
1907+ std::vector<ByteArray> values = {ByteArray (" " )};
1908+ other->Update (values.data (), values.size (), 0 );
1909+ ASSERT_TRUE (other->HasMinMax ());
1910+ EXPECT_EQ (other->min (), ByteArray (" " ));
1911+ EXPECT_EQ (other->max (), ByteArray (" " ));
1912+ stats->Merge (*other);
1913+ ASSERT_TRUE (stats->HasMinMax ()) << " empty + (min='', max='') = has_minmax" ;
1914+ EXPECT_EQ (stats->min (), ByteArray (" " ));
1915+ EXPECT_EQ (stats->max (), ByteArray (" " ));
1916+ }
1917+
1918+ // 3c. (min="", max=""), merge (no min/max) -> min="", max=""
1919+ {
1920+ auto stats = MakeStatistics<ByteArrayType>(&descr);
1921+ std::vector<ByteArray> values = {ByteArray (" " )};
1922+ stats->Update (values.data (), values.size (), 0 );
1923+ ASSERT_TRUE (stats->HasMinMax ());
1924+ EXPECT_EQ (stats->min (), ByteArray (" " ));
1925+ EXPECT_EQ (stats->max (), ByteArray (" " ));
1926+ stats->Merge (*make_empty_stats ());
1927+ ASSERT_TRUE (stats->HasMinMax ()) << " (min='', max='') + empty = has_minmax" ;
1928+ EXPECT_EQ (stats->min (), ByteArray (" " ));
1929+ EXPECT_EQ (stats->max (), ByteArray (" " ));
1930+ }
1931+
1932+ // 4. (min="", max="zzz"), merge (no min/max) -> min="", max="zzz"
1933+ {
1934+ auto stats = make_stats_with_empty_min ();
1935+ ASSERT_TRUE (stats->HasMinMax ());
1936+ stats->Merge (*make_empty_stats ());
1937+ ASSERT_TRUE (stats->HasMinMax ()) << " has_minmax + empty = has_minmax" ;
1938+ EXPECT_EQ (stats->min (), ByteArray (" " ));
1939+ EXPECT_EQ (stats->max (), ByteArray (" zzz" ));
1940+ }
1941+
1942+ // 5. (min="", max="zzz"), merge (min="", max="yyy") -> min="", max="zzz"
1943+ {
1944+ auto stats = make_stats_with_empty_min ();
1945+ auto other = make_stats_with_empty_min ();
1946+ // Modify other to have different max for clarity
1947+ stats->Merge (*other);
1948+ ASSERT_TRUE (stats->HasMinMax ());
1949+ EXPECT_EQ (stats->min (), ByteArray (" " ));
1950+ EXPECT_EQ (stats->max (), ByteArray (" zzz" ));
1951+ }
1952+
1953+ // 6. (min="", max="zzz"), merge (min="aaa", max="yyy") -> min="", max="zzz"
1954+ {
1955+ auto stats = make_stats_with_empty_min ();
1956+ stats->Merge (*make_stats_with_nonempty_min ());
1957+ ASSERT_TRUE (stats->HasMinMax ());
1958+ EXPECT_EQ (stats->min (), ByteArray (" " )) << " empty string < 'aaa'" ;
1959+ EXPECT_EQ (stats->max (), ByteArray (" zzz" )) << " 'zzz' > 'yyy'" ;
1960+ }
1961+
1962+ // 7. (min="aaa", max="yyy"), merge (no min/max) -> min="aaa", max="yyy"
1963+ {
1964+ auto stats = make_stats_with_nonempty_min ();
1965+ stats->Merge (*make_empty_stats ());
1966+ ASSERT_TRUE (stats->HasMinMax ());
1967+ EXPECT_EQ (stats->min (), ByteArray (" aaa" ));
1968+ EXPECT_EQ (stats->max (), ByteArray (" yyy" ));
1969+ }
1970+
1971+ // 8. (min="aaa", max="yyy"), merge (min="", max="zzz") -> min="", max="zzz"
1972+ {
1973+ auto stats = make_stats_with_nonempty_min ();
1974+ stats->Merge (*make_stats_with_empty_min ());
1975+ ASSERT_TRUE (stats->HasMinMax ());
1976+ EXPECT_EQ (stats->min (), ByteArray (" " )) << " empty string < 'aaa'" ;
1977+ EXPECT_EQ (stats->max (), ByteArray (" zzz" )) << " 'zzz' > 'yyy'" ;
1978+ }
1979+
1980+ // 9. (min="aaa", max="yyy"), merge (min="bbb", max="xxx") -> min="aaa", max="yyy"
1981+ {
1982+ auto stats = make_stats_with_nonempty_min ();
1983+ auto other = MakeStatistics<ByteArrayType>(&descr);
1984+ std::vector<ByteArray> values = {ByteArray (" bbb" ), ByteArray (" xxx" )};
1985+ other->Update (values.data (), values.size (), 0 );
1986+ stats->Merge (*other);
1987+ ASSERT_TRUE (stats->HasMinMax ());
1988+ EXPECT_EQ (stats->min (), ByteArray (" aaa" ));
1989+ EXPECT_EQ (stats->max (), ByteArray (" yyy" ));
1990+ }
1991+ }
1992+
1993+ // GH-47995: Statistics from encoded empty string min should be preserved after merge.
1994+ // This test reproduces the bug where empty string min (encoded as "") results in
1995+ // ByteArray with ptr=nullptr after internal Copy(), which was incorrectly rejected
1996+ // by CleanStatistic().
1997+ TEST (TestByteArrayStatistics, MergeEncodedEmptyStringMin) {
1998+ NodePtr node =
1999+ PrimitiveNode::Make (" StringColumn" , Repetition::OPTIONAL, Type::BYTE_ARRAY);
2000+ ColumnDescriptor descr (node, 1 , 1 );
2001+
2002+ // Create statistics from encoded values (simulates reading from Parquet metadata)
2003+ // Empty string "" as min, "zzz" as max
2004+ std::string encoded_min = " " ; // empty string
2005+ std::string encoded_max = " zzz" ;
2006+ auto stats1 = Statistics::Make (&descr, encoded_min, encoded_max,
2007+ /* num_values=*/ 100 , /* null_count=*/ 0 ,
2008+ /* distinct_count=*/ 0 , /* has_min_max=*/ true ,
2009+ /* has_null_count=*/ true , /* has_distinct_count=*/ false );
2010+ auto typed_stats1 = std::dynamic_pointer_cast<TypedStatistics<ByteArrayType>>(stats1);
2011+ ASSERT_TRUE (typed_stats1->HasMinMax ());
2012+ EXPECT_EQ (typed_stats1->min (), ByteArray (" " ));
2013+ EXPECT_EQ (typed_stats1->max (), ByteArray (" zzz" ));
2014+
2015+ // Create second statistics with "aaa" as min, "bbb" as max
2016+ std::string encoded_min2 = " aaa" ;
2017+ std::string encoded_max2 = " bbb" ;
2018+ auto stats2 = Statistics::Make (&descr, encoded_min2, encoded_max2,
2019+ /* num_values=*/ 100 , /* null_count=*/ 0 ,
2020+ /* distinct_count=*/ 0 , /* has_min_max=*/ true ,
2021+ /* has_null_count=*/ true , /* has_distinct_count=*/ false );
2022+ auto typed_stats2 = std::dynamic_pointer_cast<TypedStatistics<ByteArrayType>>(stats2);
2023+ ASSERT_TRUE (typed_stats2->HasMinMax ());
2024+
2025+ // Create a fresh stats object and merge both
2026+ auto merged = MakeStatistics<ByteArrayType>(&descr);
2027+ merged->Merge (*typed_stats1);
2028+ ASSERT_TRUE (merged->HasMinMax ()) << " Min/max should be preserved after first merge" ;
2029+ EXPECT_EQ (merged->min (), ByteArray (" " ));
2030+ EXPECT_EQ (merged->max (), ByteArray (" zzz" ));
2031+
2032+ merged->Merge (*typed_stats2);
2033+ ASSERT_TRUE (merged->HasMinMax ()) << " Min/max should be preserved after second merge" ;
2034+ EXPECT_EQ (merged->min (), ByteArray (" " )); // empty string should still be min
2035+ EXPECT_EQ (merged->max (), ByteArray (" zzz" ));
2036+ }
2037+
18072038} // namespace test
18082039} // namespace parquet
0 commit comments