-
Notifications
You must be signed in to change notification settings - Fork 4.8k
HIVE-29617: Error while loading column statistics of Iceberg table after upgrading Hive #6496
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -239,22 +239,33 @@ | |
| return aggregateStat.getNumRows(); | ||
| } | ||
|
|
||
| private static void estimateStatsForMissingCols(List<String> neededColumns, List<ColStatistics> columnStats, | ||
| HiveConf conf, long nr, List<ColumnInfo> schema) { | ||
| /** | ||
| * Estimates column statistics for columns specified in {@code neededColumnNames} | ||
| * that do not already have statistics in the {@code existingColStats} list. | ||
| * | ||
| * @return A {@link List} of {@link ColStatistics} objects containing | ||
| * both the provided existing statistics and the newly estimated ones. | ||
| */ | ||
| static List<ColStatistics> estimateStatsForMissingCols( | ||
| List<String> neededColumnNames, List<ColStatistics> existingColStats, HiveConf conf, long nr, | ||
| List<ColumnInfo> schema) { | ||
|
|
||
| Set<String> neededCols = new HashSet<>(neededColumns); | ||
| Set<String> colsWithStats = new HashSet<>(); | ||
| Set<String> neededCols = new HashSet<>(neededColumnNames); | ||
| Set<String> columnNamesWithStats = new HashSet<>(existingColStats.size()); | ||
|
Check warning on line 254 in ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
|
||
|
|
||
| for (ColStatistics cstats : columnStats) { | ||
| colsWithStats.add(cstats.getColumnName()); | ||
| for (ColStatistics cstats : existingColStats) { | ||
| columnNamesWithStats.add(cstats.getColumnName()); | ||
| } | ||
|
|
||
| List<String> missingColStats = new ArrayList<>(Sets.difference(neededCols, colsWithStats)); | ||
| List<String> missingColumnNames = new ArrayList<>(Sets.difference(neededCols, columnNamesWithStats)); | ||
| ArrayList<ColStatistics> combined = new ArrayList<>(existingColStats.size() + missingColumnNames.size()); | ||
| combined.addAll(existingColStats); | ||
|
|
||
| if (!missingColStats.isEmpty()) { | ||
| columnStats.addAll( | ||
| estimateStats(schema, missingColStats, conf, nr)); | ||
| if (!missingColumnNames.isEmpty()) { | ||
| combined.addAll(estimateStats(schema, missingColumnNames, conf, nr)); | ||
| } | ||
|
|
||
| return combined; | ||
| } | ||
|
|
||
| public static Statistics collectStatistics(HiveConf conf, PrunedPartitionList partList, | ||
|
|
@@ -300,7 +311,7 @@ | |
| if (needColStats && !metaTable) { | ||
| colStats = getTableColumnStats(table, neededColumns, colStatsCache, fetchColStats); | ||
| if (estimateStats) { | ||
| estimateStatsForMissingCols(neededColumns, colStats, conf, nr, schema); | ||
| colStats = estimateStatsForMissingCols(neededColumns, colStats, conf, nr, schema); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I checked the usage of the |
||
| } | ||
| // we should have stats for all columns (estimated or actual) | ||
| if (neededColumns.size() == colStats.size()) { | ||
|
|
@@ -386,7 +397,7 @@ | |
| boolean statsRetrieved = aggrStats != null && | ||
| aggrStats.getColStats() != null && aggrStats.getColStatsSize() != 0; | ||
| if (neededColumns.isEmpty() || (!neededColsToRetrieve.isEmpty() && !statsRetrieved)) { | ||
| estimateStatsForMissingCols(neededColsToRetrieve, columnStats, conf, nr, schema); | ||
| columnStats = estimateStatsForMissingCols(neededColsToRetrieve, columnStats, conf, nr, schema); | ||
| // There are some partitions with no state (or we didn't fetch any state). | ||
| // Update the stats with empty list to reflect that in the | ||
| // state/initialize structures. | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,6 +18,8 @@ | |
|
|
||
| package org.apache.hadoop.hive.ql.stats; | ||
|
|
||
| import static org.junit.Assert.assertFalse; | ||
|
Check warning on line 21 in ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java
|
||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think this is unused, as highlighted by Sonar too. |
||
| import static org.junit.Assert.assertTrue; | ||
| import static org.junit.jupiter.api.Assertions.assertEquals; | ||
| import static org.junit.jupiter.api.Assertions.assertNotEquals; | ||
| import static org.junit.jupiter.api.Assertions.assertNotNull; | ||
|
|
@@ -41,10 +43,12 @@ | |
| import org.apache.hadoop.hive.metastore.api.LongColumnStatsData; | ||
| import org.apache.hadoop.hive.metastore.api.Timestamp; | ||
| import org.apache.hadoop.hive.metastore.api.TimestampColumnStatsData; | ||
| import org.apache.hadoop.hive.ql.exec.ColumnInfo; | ||
| import org.apache.hadoop.hive.ql.plan.ColStatistics; | ||
| import org.apache.hadoop.hive.ql.plan.ColStatistics.Range; | ||
| import org.apache.hadoop.hive.ql.plan.Statistics; | ||
| import org.apache.hadoop.hive.serde.serdeConstants; | ||
| import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; | ||
| import org.junit.jupiter.api.Test; | ||
| import org.junit.jupiter.params.ParameterizedTest; | ||
| import org.junit.jupiter.params.provider.Arguments; | ||
|
|
@@ -565,4 +569,44 @@ | |
| assertEquals(1700000000L, range.maxValue.longValue(), "maxValue mismatch for TIMESTAMP"); | ||
| } | ||
|
|
||
| @Test | ||
| void testEstimateStatsForMissingColsHandlesEmptyList() { | ||
| HiveConf conf = new HiveConf(); | ||
|
|
||
| ColumnInfo columnInfoA = new ColumnInfo("a", TypeInfoFactory.intTypeInfo, "t", false); | ||
|
|
||
| List<ColStatistics> allColumnStats = StatsUtils.estimateStatsForMissingCols( | ||
| List.of("a"), Collections.emptyList(), conf, 0, List.of(columnInfoA)); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do you think we can have a situation where |
||
|
|
||
| assertEquals(1, allColumnStats.size()); | ||
| } | ||
|
|
||
| @Test | ||
| void testEstimateStatsForMissingColsCombinesExistingStatsAndEstimations() { | ||
| HiveConf conf = new HiveConf(); | ||
|
|
||
| ColumnInfo colNeededButNotExists = new ColumnInfo("neededButNotExists", TypeInfoFactory.intTypeInfo, "t", false); | ||
| ColumnInfo colNeededAndExists = new ColumnInfo("neededAndExists", TypeInfoFactory.intTypeInfo, "t", false); | ||
| ColumnInfo colNotNeededButExists = new ColumnInfo("notNeededButExists", TypeInfoFactory.intTypeInfo, "t", false); | ||
| ColumnInfo colNotNeededNotExists = new ColumnInfo("notNeededNotExists", TypeInfoFactory.intTypeInfo, "t", false); | ||
|
|
||
| ColStatistics colStatNeededAndExists = new ColStatistics(); | ||
| colStatNeededAndExists.setColumnName(colNeededAndExists.getInternalName()); | ||
| ColStatistics colStatNotNeededButExists = new ColStatistics(); | ||
| colStatNotNeededButExists.setColumnName(colNotNeededButExists.getInternalName()); | ||
|
|
||
| List<ColStatistics> allColumnStats = StatsUtils.estimateStatsForMissingCols( | ||
| List.of(colNeededAndExists.getInternalName(), colNeededButNotExists.getInternalName()), | ||
| List.of(colStatNeededAndExists, colStatNotNeededButExists), | ||
| conf, | ||
| 0, | ||
| List.of(colNeededButNotExists, colNeededAndExists, colNotNeededButExists, colNotNeededNotExists)); | ||
|
|
||
| assertEquals(3, allColumnStats.size()); | ||
| assertEquals(allColumnStats.get(0), colStatNeededAndExists); | ||
| assertEquals(allColumnStats.get(1), colStatNotNeededButExists); | ||
| assertTrue(allColumnStats.get(2).isEstimated()); | ||
| assertEquals(allColumnStats.get(2).getColumnName(), colNeededButNotExists.getInternalName()); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. nit: I think except the first |
||
| } | ||
|
|
||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: Sonar's suggestion of
`HashSet.newHashSet(int numMappings)` is more efficient. Not a blocker for this PR.