From 08bc734a5c11367a149c9e4ee6798facb19dd34b Mon Sep 17 00:00:00 2001 From: kasakrisz Date: Mon, 18 May 2026 12:52:44 +0200 Subject: [PATCH 1/4] HIVE-29617: Error while loading column statistics of Iceberg table after upgrading Hive --- .../hadoop/hive/ql/stats/StatsUtils.java | 35 ++++++++++----- .../hadoop/hive/ql/stats/TestStatsUtils.java | 44 +++++++++++++++++++ 2 files changed, 67 insertions(+), 12 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java index 55f9d0c1e158..979dd11227bd 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java @@ -239,22 +239,33 @@ public static long getNumRows(HiveConf conf, List schema, Table tabl return aggregateStat.getNumRows(); } - private static void estimateStatsForMissingCols(List neededColumns, List columnStats, - HiveConf conf, long nr, List schema) { + /** + * Estimates column statistics for columns specified in {@code neededColumnNames} + * that do not already have statistics in the {@code existingColStats} list. + * + * @return A {@link List} of {@link ColStatistics} objects containing + * both the provided existing statistics and the newly estimated ones. + */ + static List estimateStatsForMissingCols( + List neededColumnNames, List existingColStats, HiveConf conf, long nr, + List schema) { - Set neededCols = new HashSet<>(neededColumns); - Set colsWithStats = new HashSet<>(); + Set neededCols = new HashSet<>(neededColumnNames); + Set columnNamesWithStats = new HashSet<>(existingColStats.size()); - for (ColStatistics cstats : columnStats) { - colsWithStats.add(cstats.getColumnName()); + for (ColStatistics cstats : existingColStats) { + columnNamesWithStats.add(cstats.getColumnName()); } - List missingColStats = new ArrayList<>(Sets.difference(neededCols, colsWithStats)); + List missingColumnNames = new ArrayList<>(Sets.difference(neededCols, columnNamesWithStats)); + ArrayList combined = new ArrayList<>(existingColStats.size() + missingColumnNames.size()); + combined.addAll(existingColStats); - if (!missingColStats.isEmpty()) { - columnStats.addAll( - estimateStats(schema, missingColStats, conf, nr)); + if (!missingColumnNames.isEmpty()) { + combined.addAll(estimateStats(schema, missingColumnNames, conf, nr)); } + + return combined; } public static Statistics collectStatistics(HiveConf conf, PrunedPartitionList partList, @@ -300,7 +311,7 @@ private static Statistics collectStatistics(HiveConf conf, PrunedPartitionList p if (needColStats && !metaTable) { colStats = getTableColumnStats(table, neededColumns, colStatsCache, fetchColStats); if (estimateStats) { - estimateStatsForMissingCols(neededColumns, colStats, conf, nr, schema); + colStats = estimateStatsForMissingCols(neededColumns, colStats, conf, nr, schema); } // we should have stats for all columns (estimated or actual) if (neededColumns.size() == colStats.size()) { @@ -386,7 +397,7 @@ private static Statistics collectStatistics(HiveConf conf, PrunedPartitionList p boolean statsRetrieved = aggrStats != null && aggrStats.getColStats() != null && aggrStats.getColStatsSize() != 0; if (neededColumns.isEmpty() || (!neededColsToRetrieve.isEmpty() && !statsRetrieved)) { - estimateStatsForMissingCols(neededColsToRetrieve, columnStats, conf, nr, schema); + columnStats = estimateStatsForMissingCols(neededColsToRetrieve, columnStats, conf, nr, schema); // There are some partitions with no state (or we didn't fetch any state). // Update the stats with empty list to reflect that in the // state/initialize structures. diff --git a/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java b/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java index c009472fed0a..dfa4f8e02af7 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java @@ -18,6 +18,8 @@ package org.apache.hadoop.hive.ql.stats; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; @@ -41,10 +43,12 @@ import org.apache.hadoop.hive.metastore.api.LongColumnStatsData; import org.apache.hadoop.hive.metastore.api.Timestamp; import org.apache.hadoop.hive.metastore.api.TimestampColumnStatsData; +import org.apache.hadoop.hive.ql.exec.ColumnInfo; import org.apache.hadoop.hive.ql.plan.ColStatistics; import org.apache.hadoop.hive.ql.plan.ColStatistics.Range; import org.apache.hadoop.hive.ql.plan.Statistics; import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.Arguments; @@ -565,4 +569,44 @@ void testGetColStatisticsTimestampType() { assertEquals(1700000000L, range.maxValue.longValue(), "maxValue mismatch for TIMESTAMP"); } + @Test + void testEstimateStatsForMissingColsHandlesEmptyList() { + HiveConf conf = new HiveConf(); + + ColumnInfo columnInfoA = new ColumnInfo("a", TypeInfoFactory.intTypeInfo, "t", false); + + List allColumnStats = StatsUtils.estimateStatsForMissingCols( + List.of("a"), Collections.emptyList(), conf, 0, List.of(columnInfoA)); + + assertEquals(1, allColumnStats.size()); + } + + @Test + void testEstimateStatsForMissingColsCombinesExistingStatsAndEstimations() { + HiveConf conf = new HiveConf(); + + ColumnInfo colNeededButNotExists = new ColumnInfo("neededButNotExists", TypeInfoFactory.intTypeInfo, "t", false); + ColumnInfo colNeededAndExists = new ColumnInfo("neededAndExists", TypeInfoFactory.intTypeInfo, "t", false); + ColumnInfo colNotNeededButExists = new ColumnInfo("notNeededButExists", TypeInfoFactory.intTypeInfo, "t", false); + ColumnInfo colNotNeededNotExists = new ColumnInfo("notNeededNotExists", TypeInfoFactory.intTypeInfo, "t", false); + + ColStatistics colStatNeededAndExists = new ColStatistics(); + colStatNeededAndExists.setColumnName(colNeededAndExists.getInternalName()); + ColStatistics colStatNotNeededButExists = new ColStatistics(); + colStatNotNeededButExists.setColumnName(colNotNeededButExists.getInternalName()); + + List allColumnStats = StatsUtils.estimateStatsForMissingCols( + List.of(colNeededAndExists.getInternalName(), colNeededButNotExists.getInternalName()), + List.of(colStatNeededAndExists, colStatNotNeededButExists), + conf, + 0, + List.of(colNeededButNotExists, colNeededAndExists, colNotNeededButExists, colNotNeededNotExists)); + + assertEquals(3, allColumnStats.size()); + assertEquals(allColumnStats.get(0), colStatNeededAndExists); + assertEquals(allColumnStats.get(1), colStatNotNeededButExists); + assertTrue(allColumnStats.get(2).isEstimated()); + assertEquals(allColumnStats.get(2).getColumnName(), colNeededButNotExists.getInternalName()); + } + } From 26c44caf96e52bba71f54c0887e5a04882792950 Mon Sep 17 00:00:00 2001 From: kasakrisz Date: Mon, 1 Jun 2026 10:45:42 +0200 Subject: [PATCH 2/4] address review comments: add test when needed column list is empty --- .../hadoop/hive/ql/stats/TestStatsUtils.java | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java b/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java index dfa4f8e02af7..813fa9709758 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java @@ -18,7 +18,6 @@ package org.apache.hadoop.hive.ql.stats; -import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotEquals; @@ -609,4 +608,25 @@ void testEstimateStatsForMissingColsCombinesExistingStatsAndEstimations() { assertEquals(allColumnStats.get(2).getColumnName(), colNeededButNotExists.getInternalName()); } + @Test + void testEstimateStatsForMissingColsReturnOnlyColumnsWithExistingStatsWhenNoNeededColumn() { + HiveConf conf = new HiveConf(); + + ColumnInfo colNotNeededButExists = new ColumnInfo("notNeededButExists", TypeInfoFactory.intTypeInfo, "t", false); + ColumnInfo colNotNeededNotExists = new ColumnInfo("notNeededNotExists", TypeInfoFactory.intTypeInfo, "t", false); + + ColStatistics colStatNotNeededButExists = new ColStatistics(); + colStatNotNeededButExists.setColumnName(colNotNeededButExists.getInternalName()); + + List allColumnStats = StatsUtils.estimateStatsForMissingCols( + Collections.emptyList(), + List.of(colStatNotNeededButExists), + conf, + 0, + List.of(colNotNeededButExists, colNotNeededNotExists)); + + assertEquals(1, allColumnStats.size()); + assertEquals(allColumnStats.getFirst(), colStatNotNeededButExists); + } + } From 52a3dfbc480f3771cdf38e2da307a2565276af06 Mon Sep 17 00:00:00 2001 From: kasakrisz Date: Mon, 1 Jun 2026 11:01:08 +0200 Subject: [PATCH 3/4] address review comments: add assertions for estimate flag --- ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java | 2 +- .../test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java index 979dd11227bd..e539f0ab9c4a 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java @@ -251,7 +251,7 @@ static List estimateStatsForMissingCols( List schema) { Set neededCols = new HashSet<>(neededColumnNames); - Set columnNamesWithStats = new HashSet<>(existingColStats.size()); + Set columnNamesWithStats = HashSet.newHashSet(existingColStats.size()); for (ColStatistics cstats : existingColStats) { columnNamesWithStats.add(cstats.getColumnName()); diff --git a/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java b/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java index 813fa9709758..0fef2103e213 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hive.ql.stats; +import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotEquals; @@ -603,9 +604,11 @@ void testEstimateStatsForMissingColsCombinesExistingStatsAndEstimations() { assertEquals(3, allColumnStats.size()); assertEquals(allColumnStats.get(0), colStatNeededAndExists); + assertFalse(allColumnStats.get(0).isEstimated()); assertEquals(allColumnStats.get(1), colStatNotNeededButExists); - assertTrue(allColumnStats.get(2).isEstimated()); + assertFalse(allColumnStats.get(1).isEstimated()); assertEquals(allColumnStats.get(2).getColumnName(), colNeededButNotExists.getInternalName()); + assertTrue(allColumnStats.get(2).isEstimated()); } @Test From 08e3d8fd25d134e5f88cfdf6e710c8e90c461720 Mon Sep 17 00:00:00 2001 From: kasakrisz Date: Mon, 1 Jun 2026 11:05:41 +0200 Subject: [PATCH 4/4] address review comments: fix argument order of assertEquals --- .../org/apache/hadoop/hive/ql/stats/TestStatsUtils.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java b/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java index 0fef2103e213..2faefafdee35 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java @@ -603,11 +603,11 @@ void testEstimateStatsForMissingColsCombinesExistingStatsAndEstimations() { List.of(colNeededButNotExists, colNeededAndExists, colNotNeededButExists, colNotNeededNotExists)); assertEquals(3, allColumnStats.size()); - assertEquals(allColumnStats.get(0), colStatNeededAndExists); + assertEquals(colStatNeededAndExists, allColumnStats.get(0)); assertFalse(allColumnStats.get(0).isEstimated()); - assertEquals(allColumnStats.get(1), colStatNotNeededButExists); + assertEquals(colStatNotNeededButExists, allColumnStats.get(1)); assertFalse(allColumnStats.get(1).isEstimated()); - assertEquals(allColumnStats.get(2).getColumnName(), colNeededButNotExists.getInternalName()); + assertEquals(colNeededButNotExists.getInternalName(), allColumnStats.get(2).getColumnName()); assertTrue(allColumnStats.get(2).isEstimated()); }