
Commit 75dbdf8

HIVE-29368: further tuning NDV handling, including reading stats for timestamp/date columns
1 parent: bd86e3c

File tree

11 files changed: +145 -59 lines changed

ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java

Lines changed: 12 additions & 3 deletions
@@ -832,6 +832,7 @@ public static ColStatistics getColStatistics(ColumnStatisticsObj cso, String col
       cs.setNumNulls(csd.getBinaryStats().getNumNulls());
     } else if (colTypeLowerCase.equals(serdeConstants.TIMESTAMP_TYPE_NAME)) {
       cs.setAvgColLen(JavaDataModel.get().lengthOfTimestamp());
+      cs.setCountDistint(csd.getTimestampStats().getNumDVs());
       cs.setNumNulls(csd.getTimestampStats().getNumNulls());
       Long lowVal = (csd.getTimestampStats().getLowValue() != null) ? csd.getTimestampStats().getLowValue()
           .getSecondsSinceEpoch() : null;
@@ -862,6 +863,7 @@ public static ColStatistics getColStatistics(ColumnStatisticsObj cso, String col
       cs.setHistogram(csd.getDecimalStats().getHistogram());
     } else if (colTypeLowerCase.equals(serdeConstants.DATE_TYPE_NAME)) {
       cs.setAvgColLen(JavaDataModel.get().lengthOfDate());
+      cs.setCountDistint(csd.getDateStats().getNumDVs());
       cs.setNumNulls(csd.getDateStats().getNumNulls());
       Long lowVal = (csd.getDateStats().getLowValue() != null) ? csd.getDateStats().getLowValue()
           .getDaysSinceEpoch() : null;
@@ -2087,9 +2089,16 @@ private static List<Long> extractNDVGroupingColumns(List<ColStatistics> colStats
     for (ColStatistics cs : colStats) {
       if (cs != null) {
         long ndv = cs.getCountDistint();
-        // Only increment ndv value if it is "known"
-        if (ndv > 0 && cs.getNumNulls() > 0) {
-          ndv = StatsUtils.safeAdd(ndv, 1);
+
+        if (ndv == 0L) {
+          // Typically, ndv == 0 means "NDV unknown", and no safe GROUPBY adjustments are possible
+          // However, there is a special exception for "constant NULL" columns. They are intentionally generated
+          // with NDV values of 0 and numNulls == numRows, while their actual NDV is 1
+          if (cs.getNumNulls() >= parentStats.getNumRows()) {
+            ndv = 1L;
+          }
+        } else if (cs.getNumNulls() > 0L) {
+          ndv = StatsUtils.safeAdd(ndv, 1L);
         }
         ndvValues.add(ndv);
       } else {
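The decision logic of the new branch is easier to see outside the diff. Below is a minimal standalone sketch of the same adjustment, under the assumption that it can be expressed over plain long arguments; the helper name adjustNdvForNulls is hypothetical and not part of the patch:

  // Sketch of the per-column NDV adjustment introduced by this patch (illustration only).
  // ndv == 0 normally means "NDV unknown", so no adjustment is safe -- except for
  // constant-NULL columns, which are generated with ndv == 0 and numNulls == numRows
  // even though their true NDV is 1 (the NULL itself).
  static long adjustNdvForNulls(long ndv, long numNulls, long parentNumRows) {
    if (ndv == 0L) {
      return (numNulls >= parentNumRows) ? 1L : 0L; // constant-NULL column counts as 1
    }
    if (numNulls > 0L) {
      return StatsUtils.safeAdd(ndv, 1L); // NULL counts as one extra distinct value
    }
    return ndv;
  }

The old code bumped any positive NDV by one when nulls were present and silently left ndv == 0 untouched; the rewrite makes the "unknown" case explicit and carves out the constant-NULL exception, so always-NULL GROUP BY keys are no longer treated as having unknown cardinality.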

ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java

Lines changed: 77 additions & 0 deletions
@@ -25,17 +25,24 @@

 import java.lang.reflect.Field;
 import java.lang.reflect.Modifier;
+import java.util.Collections;
+import java.util.List;
 import java.util.Set;
 import java.util.stream.Stream;

 import org.apache.commons.lang3.reflect.FieldUtils;
 import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
 import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
+import org.apache.hadoop.hive.metastore.api.Date;
+import org.apache.hadoop.hive.metastore.api.DateColumnStatsData;
 import org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData;
 import org.apache.hadoop.hive.metastore.api.LongColumnStatsData;
+import org.apache.hadoop.hive.metastore.api.Timestamp;
+import org.apache.hadoop.hive.metastore.api.TimestampColumnStatsData;
 import org.apache.hadoop.hive.ql.plan.ColStatistics;
 import org.apache.hadoop.hive.ql.plan.ColStatistics.Range;
+import org.apache.hadoop.hive.ql.plan.Statistics;
 import org.apache.hadoop.hive.serde.serdeConstants;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.params.ParameterizedTest;
@@ -244,4 +251,74 @@ static Stream<Arguments> floatingPointStatisticsTestData() {
     );
   }

+  @Test
+  void testGetColStatisticsTimestampType() {
+    ColumnStatisticsObj cso = new ColumnStatisticsObj();
+    cso.setColName("ts_col");
+    cso.setColType(serdeConstants.TIMESTAMP_TYPE_NAME);
+
+    TimestampColumnStatsData tsStats = new TimestampColumnStatsData();
+    tsStats.setNumDVs(35);
+    tsStats.setNumNulls(5);
+    tsStats.setLowValue(new Timestamp(1000));
+    tsStats.setHighValue(new Timestamp(2000));
+
+    ColumnStatisticsData data = new ColumnStatisticsData();
+    data.setTimestampStats(tsStats);
+    cso.setStatsData(data);
+
+    ColStatistics cs = StatsUtils.getColStatistics(cso, "ts_col");
+
+    assertNotNull(cs, "ColStatistics should not be null");
+    assertEquals(35, cs.getCountDistint(), "TIMESTAMP NumDVs should be extracted from metastore stats");
+    assertEquals(5, cs.getNumNulls(), "NumNulls mismatch");
+  }
+
+  @Test
+  void testGetColStatisticsDateType() {
+    ColumnStatisticsObj cso = new ColumnStatisticsObj();
+    cso.setColName("date_col");
+    cso.setColType(serdeConstants.DATE_TYPE_NAME);
+
+    DateColumnStatsData dateStats = new DateColumnStatsData();
+    dateStats.setNumDVs(42);
+    dateStats.setNumNulls(3);
+    dateStats.setLowValue(new Date(18000));
+    dateStats.setHighValue(new Date(19000));
+
+    ColumnStatisticsData data = new ColumnStatisticsData();
+    data.setDateStats(dateStats);
+    cso.setStatsData(data);
+
+    ColStatistics cs = StatsUtils.getColStatistics(cso, "date_col");
+
+    assertNotNull(cs, "ColStatistics should not be null");
+    assertEquals(42, cs.getCountDistint(), "DATE NumDVs should be extracted from metastore stats");
+    assertEquals(3, cs.getNumNulls(), "NumNulls mismatch");
+  }
+
+  private ColStatistics createColStats(String name, long ndv, long numNulls) {
+    ColStatistics cs = new ColStatistics(name, "string");
+    cs.setCountDistint(ndv);
+    cs.setNumNulls(numNulls);
+    return cs;
+  }
+
+  private Statistics createParentStats(long numRows) {
+    Statistics stats = new Statistics(numRows, 0, 0, 0);
+    stats.setColumnStatsState(Statistics.State.COMPLETE);
+    return stats;
+  }
+
+  @Test
+  void testComputeNDVGroupingColumnsPartialStats() {
+    ColStatistics cs = createColStats("partial_stats_col", 0, 100);
+    Statistics parentStats = createParentStats(1000);
+    List<ColStatistics> colStats = Collections.singletonList(cs);
+
+    long ndv = StatsUtils.computeNDVGroupingColumns(colStats, parentStats, false);
+
+    assertEquals(0, ndv, "Partial stats (ndv=0, numNulls<numRows) should return 0, not inflate to 1");
+  }
+
 }
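The new tests cover the partial-stats path (ndv=0 with numNulls below numRows must stay 0); the complementary constant-NULL path can be exercised the same way. A sketch of such a test, reusing the createColStats/createParentStats helpers from the patch (this test is not part of the commit, only an illustration of the behavior the new branch implies):

  @Test
  void testComputeNDVGroupingColumnsConstantNullColumn() {
    // ndv == 0 with numNulls == numRows marks a constant-NULL column,
    // whose actual NDV is 1 (the single NULL value).
    ColStatistics cs = createColStats("const_null_col", 0, 1000);
    Statistics parentStats = createParentStats(1000);
    List<ColStatistics> colStats = Collections.singletonList(cs);

    long ndv = StatsUtils.computeNDVGroupingColumns(colStats, parentStats, false);

    assertEquals(1, ndv, "Constant NULL column (ndv=0, numNulls>=numRows) should count as one distinct value");
  }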

ql/src/test/results/clientpositive/llap/parquet_vectorization_15.q.out

Lines changed: 7 additions & 7 deletions
@@ -119,7 +119,7 @@ STAGE PLANS:
 minReductionHashAggr: 0.4
 mode: hash
 outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16
-Statistics: Num rows: 6144 Data size: 1216372 Basic stats: COMPLETE Column stats: COMPLETE
+Statistics: Num rows: 12288 Data size: 2432638 Basic stats: COMPLETE Column stats: COMPLETE
 Reduce Output Operator
 key expressions: _col0 (type: float), _col1 (type: boolean), _col2 (type: double), _col3 (type: string), _col4 (type: tinyint), _col5 (type: int), _col6 (type: timestamp)
 null sort order: zzzzzzz
@@ -129,7 +129,7 @@ STAGE PLANS:
 className: VectorReduceSinkMultiKeyOperator
 native: true
 nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true
-Statistics: Num rows: 6144 Data size: 1216372 Basic stats: COMPLETE Column stats: COMPLETE
+Statistics: Num rows: 12288 Data size: 2432638 Basic stats: COMPLETE Column stats: COMPLETE
 value expressions: _col7 (type: double), _col8 (type: double), _col9 (type: bigint), _col10 (type: double), _col11 (type: double), _col12 (type: double), _col13 (type: bigint), _col14 (type: double), _col15 (type: double), _col16 (type: bigint)
 Execution mode: vectorized, llap
 LLAP IO: all inputs (cache only)
@@ -154,16 +154,16 @@ STAGE PLANS:
 keys: KEY._col0 (type: float), KEY._col1 (type: boolean), KEY._col2 (type: double), KEY._col3 (type: string), KEY._col4 (type: tinyint), KEY._col5 (type: int), KEY._col6 (type: timestamp)
 mode: mergepartial
 outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16
-Statistics: Num rows: 6144 Data size: 1216372 Basic stats: COMPLETE Column stats: COMPLETE
+Statistics: Num rows: 12288 Data size: 2432638 Basic stats: COMPLETE Column stats: COMPLETE
 Select Operator
 expressions: _col0 (type: float), _col1 (type: boolean), _col2 (type: double), _col3 (type: string), _col4 (type: tinyint), _col5 (type: int), _col6 (type: timestamp), power(((_col7 - ((_col8 * _col8) / _col9)) / if((_col9 = 1L), null, (_col9 - 1))), 0.5) (type: double), (-26.28 - CAST( _col5 AS decimal(10,0))) (type: decimal(13,2)), _col10 (type: double), (_col2 * 79.553D) (type: double), (33.0 % _col0) (type: float), power(((_col11 - ((_col12 * _col12) / _col13)) / if((_col13 = 1L), null, (_col13 - 1))), 0.5) (type: double), ((_col11 - ((_col12 * _col12) / _col13)) / _col13) (type: double), (-23.0D % _col2) (type: double), (- _col4) (type: tinyint), ((_col14 - ((_col15 * _col15) / _col16)) / if((_col16 = 1L), null, (_col16 - 1))) (type: double), (UDFToFloat(_col5) - _col0) (type: float), (-23 % UDFToInteger(_col4)) (type: int), (- (-26.28 - CAST( _col5 AS decimal(10,0)))) (type: decimal(13,2)), power(((_col14 - ((_col15 * _col15) / _col16)) / _col16), 0.5) (type: double)
 outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17, _col18, _col19, _col20
-Statistics: Num rows: 6144 Data size: 2592628 Basic stats: COMPLETE Column stats: COMPLETE
+Statistics: Num rows: 12288 Data size: 5185150 Basic stats: COMPLETE Column stats: COMPLETE
 Reduce Output Operator
 key expressions: _col0 (type: float), _col1 (type: boolean), _col2 (type: double), _col3 (type: string), _col4 (type: tinyint), _col5 (type: int), _col6 (type: timestamp)
 null sort order: zzzzzzz
 sort order: +++++++
-Statistics: Num rows: 6144 Data size: 2592628 Basic stats: COMPLETE Column stats: COMPLETE
+Statistics: Num rows: 12288 Data size: 5185150 Basic stats: COMPLETE Column stats: COMPLETE
 value expressions: _col7 (type: double), _col8 (type: decimal(13,2)), _col9 (type: double), _col10 (type: double), _col11 (type: float), _col12 (type: double), _col13 (type: double), _col14 (type: double), _col15 (type: tinyint), _col16 (type: double), _col17 (type: float), _col18 (type: int), _col19 (type: decimal(13,2)), _col20 (type: double)
 Reducer 3
 Execution mode: llap
@@ -175,10 +175,10 @@ STAGE PLANS:
 Select Operator
 expressions: KEY.reducesinkkey0 (type: float), KEY.reducesinkkey1 (type: boolean), KEY.reducesinkkey2 (type: double), KEY.reducesinkkey3 (type: string), KEY.reducesinkkey4 (type: tinyint), KEY.reducesinkkey5 (type: int), KEY.reducesinkkey6 (type: timestamp), VALUE._col0 (type: double), VALUE._col1 (type: decimal(13,2)), VALUE._col2 (type: double), VALUE._col3 (type: double), VALUE._col4 (type: float), VALUE._col5 (type: double), VALUE._col6 (type: double), VALUE._col7 (type: double), VALUE._col8 (type: tinyint), VALUE._col9 (type: double), VALUE._col10 (type: float), VALUE._col11 (type: int), VALUE._col12 (type: decimal(13,2)), VALUE._col13 (type: double)
 outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17, _col18, _col19, _col20
-Statistics: Num rows: 6144 Data size: 2592628 Basic stats: COMPLETE Column stats: COMPLETE
+Statistics: Num rows: 12288 Data size: 5185150 Basic stats: COMPLETE Column stats: COMPLETE
 File Output Operator
 compressed: false
-Statistics: Num rows: 6144 Data size: 2592628 Basic stats: COMPLETE Column stats: COMPLETE
+Statistics: Num rows: 12288 Data size: 5185150 Basic stats: COMPLETE Column stats: COMPLETE
 table:
 input format: org.apache.hadoop.mapred.SequenceFileInputFormat
 output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
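The estimate changes in this golden file are the downstream effect of the Java change above: the GROUP BY keys include a timestamp column (_col6), whose NDV previously could not be read from the metastore and so contributed nothing to the estimated number of groups. With the timestamp NDV now available, the NDV product over the seven keys grows, and the row estimate rises from 6144 to 12288, with data sizes scaling in proportion (1216372 -> 2432638, 2592628 -> 5185150). As a rough sketch of the estimation shape (the exact formula and its caps live in Hive's group-by statistics annotation rule; this expression is an assumption, not a quote of that code):

  estimatedGroups = min(ndv(_col0) * ndv(_col1) * ... * ndv(_col6), inputRows)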

ql/src/test/results/clientpositive/llap/parquet_vectorization_16.q.out

Lines changed: 5 additions & 5 deletions
@@ -96,7 +96,7 @@ STAGE PLANS:
 minReductionHashAggr: 0.4
 mode: hash
 outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6
-Statistics: Num rows: 5979 Data size: 825318 Basic stats: COMPLETE Column stats: COMPLETE
+Statistics: Num rows: 6144 Data size: 848064 Basic stats: COMPLETE Column stats: COMPLETE
 Reduce Output Operator
 key expressions: _col0 (type: string), _col1 (type: double), _col2 (type: timestamp)
 null sort order: zzz
@@ -106,7 +106,7 @@ STAGE PLANS:
 className: VectorReduceSinkMultiKeyOperator
 native: true
 nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true
-Statistics: Num rows: 5979 Data size: 825318 Basic stats: COMPLETE Column stats: COMPLETE
+Statistics: Num rows: 6144 Data size: 848064 Basic stats: COMPLETE Column stats: COMPLETE
 value expressions: _col3 (type: bigint), _col4 (type: double), _col5 (type: double), _col6 (type: double)
 Execution mode: vectorized, llap
 LLAP IO: all inputs (cache only)
@@ -141,7 +141,7 @@ STAGE PLANS:
 keys: KEY._col0 (type: string), KEY._col1 (type: double), KEY._col2 (type: timestamp)
 mode: mergepartial
 outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6
-Statistics: Num rows: 5979 Data size: 825318 Basic stats: COMPLETE Column stats: COMPLETE
+Statistics: Num rows: 6144 Data size: 848064 Basic stats: COMPLETE Column stats: COMPLETE
 Select Operator
 expressions: _col0 (type: string), _col1 (type: double), _col2 (type: timestamp), (_col1 - 9763215.5639D) (type: double), (- (_col1 - 9763215.5639D)) (type: double), _col3 (type: bigint), power((greatest(0,(_col4 - ((_col5 * _col5) / _col3))) / if((_col3 = 1L), null, (_col3 - 1))), 0.5) (type: double), (- power((greatest(0,(_col4 - ((_col5 * _col5) / _col3))) / if((_col3 = 1L), null, (_col3 - 1))), 0.5)) (type: double), (power((greatest(0,(_col4 - ((_col5 * _col5) / _col3))) / if((_col3 = 1L), null, (_col3 - 1))), 0.5) * UDFToDouble(_col3)) (type: double), _col6 (type: double), (9763215.5639D / _col1) (type: double), (CAST( _col3 AS decimal(19,0)) / -1.389) (type: decimal(28,6)), power((greatest(0,(_col4 - ((_col5 * _col5) / _col3))) / if((_col3 = 1L), null, (_col3 - 1))), 0.5) (type: double)
 outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12
@@ -150,13 +150,13 @@ STAGE PLANS:
 native: true
 projectedOutputColumnNums: [0, 1, 2, 7, 9, 3, 18, 28, 39, 6, 40, 42, 51]
 selectExpressions: DoubleColSubtractDoubleScalar(col 1:double, val 9763215.5639) -> 7:double, DoubleColUnaryMinus(col 8:double)(children: DoubleColSubtractDoubleScalar(col 1:double, val 9763215.5639) -> 8:double) -> 9:double, FuncPowerDoubleToDouble(col 17:double)(children: DoubleColDivideLongColumn(col 13:double, col 16:bigint)(children: VectorUDFAdaptor(greatest(0,(_col4 - ((_col5 * _col5) / _col3))))(children: DoubleColSubtractDoubleColumn(col 4:double, col 11:double)(children: DoubleColDivideLongColumn(col 10:double, col 3:bigint)(children: DoubleColMultiplyDoubleColumn(col 5:double, col 5:double) -> 10:double) -> 11:double) -> 12:double) -> 13:double, IfExprNullCondExpr(col 14:boolean, null, col 15:bigint)(children: LongColEqualLongScalar(col 3:bigint, val 1) -> 14:boolean, LongColSubtractLongScalar(col 3:bigint, val 1) -> 15:bigint) -> 16:bigint) -> 17:double) -> 18:double, DoubleColUnaryMinus(col 27:double)(children: FuncPowerDoubleToDouble(col 26:double)(children: DoubleColDivideLongColumn(col 22:double, col 25:bigint)(children: VectorUDFAdaptor(greatest(0,(_col4 - ((_col5 * _col5) / _col3))))(children: DoubleColSubtractDoubleColumn(col 4:double, col 20:double)(children: DoubleColDivideLongColumn(col 19:double, col 3:bigint)(children: DoubleColMultiplyDoubleColumn(col 5:double, col 5:double) -> 19:double) -> 20:double) -> 21:double) -> 22:double, IfExprNullCondExpr(col 23:boolean, null, col 24:bigint)(children: LongColEqualLongScalar(col 3:bigint, val 1) -> 23:boolean, LongColSubtractLongScalar(col 3:bigint, val 1) -> 24:bigint) -> 25:bigint) -> 26:double) -> 27:double) -> 28:double, DoubleColMultiplyDoubleColumn(col 37:double, col 38:double)(children: FuncPowerDoubleToDouble(col 36:double)(children: DoubleColDivideLongColumn(col 32:double, col 35:bigint)(children: VectorUDFAdaptor(greatest(0,(_col4 - ((_col5 * _col5) / _col3))))(children: DoubleColSubtractDoubleColumn(col 4:double, col 30:double)(children: DoubleColDivideLongColumn(col 29:double, col 3:bigint)(children: DoubleColMultiplyDoubleColumn(col 5:double, col 5:double) -> 29:double) -> 30:double) -> 31:double) -> 32:double, IfExprNullCondExpr(col 33:boolean, null, col 34:bigint)(children: LongColEqualLongScalar(col 3:bigint, val 1) -> 33:boolean, LongColSubtractLongScalar(col 3:bigint, val 1) -> 34:bigint) -> 35:bigint) -> 36:double) -> 37:double, CastLongToDouble(col 3:bigint) -> 38:double) -> 39:double, DoubleScalarDivideDoubleColumn(val 9763215.5639, col 1:double) -> 40:double, DecimalColDivideDecimalScalar(col 41:decimal(19,0), val -1.389)(children: CastLongToDecimal(col 3:bigint) -> 41:decimal(19,0)) -> 42:decimal(28,6), FuncPowerDoubleToDouble(col 50:double)(children: DoubleColDivideLongColumn(col 46:double, col 49:bigint)(children: VectorUDFAdaptor(greatest(0,(_col4 - ((_col5 * _col5) / _col3))))(children: DoubleColSubtractDoubleColumn(col 4:double, col 44:double)(children: DoubleColDivideLongColumn(col 43:double, col 3:bigint)(children: DoubleColMultiplyDoubleColumn(col 5:double, col 5:double) -> 43:double) -> 44:double) -> 45:double) -> 46:double, IfExprNullCondExpr(col 47:boolean, null, col 48:bigint)(children: LongColEqualLongScalar(col 3:bigint, val 1) -> 47:boolean, LongColSubtractLongScalar(col 3:bigint, val 1) -> 48:bigint) -> 49:bigint) -> 50:double) -> 51:double
-Statistics: Num rows: 5979 Data size: 1734126 Basic stats: COMPLETE Column stats: COMPLETE
+Statistics: Num rows: 6144 Data size: 1781952 Basic stats: COMPLETE Column stats: COMPLETE
 File Output Operator
 compressed: false
 File Sink Vectorization:
 className: VectorFileSinkOperator
 native: false
-Statistics: Num rows: 5979 Data size: 1734126 Basic stats: COMPLETE Column stats: COMPLETE
+Statistics: Num rows: 6144 Data size: 1781952 Basic stats: COMPLETE Column stats: COMPLETE
 table:
 input format: org.apache.hadoop.mapred.SequenceFileInputFormat
 output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
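The same mechanism produces a smaller shift here: one of the three grouping keys is a timestamp (_col2), so its newly read NDV nudges the estimated group count from 5979 up to 6144, with data sizes following (825318 -> 848064, 1734126 -> 1781952). Under the min(NDV product, input rows) sketch above, the string and double keys' NDVs already dominated the product, which would explain why the adjustment is modest; that is an inference from the numbers, not from the planner code.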
