
Commit 75dbdf8

HIVE-29368: further tuning NDV handling, including reading stats for timestamp/date columns
1 parent: bd86e3c

File tree

11 files changed: +145 -59 lines changed

ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java

Lines changed: 12 additions & 3 deletions
@@ -832,6 +832,7 @@ public static ColStatistics getColStatistics(ColumnStatisticsObj cso, String col
       cs.setNumNulls(csd.getBinaryStats().getNumNulls());
     } else if (colTypeLowerCase.equals(serdeConstants.TIMESTAMP_TYPE_NAME)) {
       cs.setAvgColLen(JavaDataModel.get().lengthOfTimestamp());
+      cs.setCountDistint(csd.getTimestampStats().getNumDVs());
       cs.setNumNulls(csd.getTimestampStats().getNumNulls());
       Long lowVal = (csd.getTimestampStats().getLowValue() != null) ? csd.getTimestampStats().getLowValue()
           .getSecondsSinceEpoch() : null;
@@ -862,6 +863,7 @@ public static ColStatistics getColStatistics(ColumnStatisticsObj cso, String col
       cs.setHistogram(csd.getDecimalStats().getHistogram());
     } else if (colTypeLowerCase.equals(serdeConstants.DATE_TYPE_NAME)) {
       cs.setAvgColLen(JavaDataModel.get().lengthOfDate());
+      cs.setCountDistint(csd.getDateStats().getNumDVs());
       cs.setNumNulls(csd.getDateStats().getNumNulls());
       Long lowVal = (csd.getDateStats().getLowValue() != null) ? csd.getDateStats().getLowValue()
           .getDaysSinceEpoch() : null;
@@ -2087,9 +2089,16 @@ private static List<Long> extractNDVGroupingColumns(List<ColStatistics> colStats
     for (ColStatistics cs : colStats) {
       if (cs != null) {
         long ndv = cs.getCountDistint();
-        // Only increment ndv value if it is "known"
-        if (ndv > 0 && cs.getNumNulls() > 0) {
-          ndv = StatsUtils.safeAdd(ndv, 1);
+
+        if (ndv == 0L) {
+          // Typically, ndv == 0 means "NDV unknown", and no safe GROUPBY adjustments are possible
+          // However, there is a special exception for "constant NULL" columns. They are intentionally generated
+          // with NDV values of 0 and numNulls == numRows, while their actual NDV is 1
+          if (cs.getNumNulls() >= parentStats.getNumRows()) {
+            ndv = 1L;
+          }
+        } else if (cs.getNumNulls() > 0L) {
+          ndv = StatsUtils.safeAdd(ndv, 1L);
         }
         ndvValues.add(ndv);
       } else {
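The decision logic of the new branch is easier to see outside the diff. Below is a minimal standalone sketch of the same adjustment, under the assumption that it can be expressed over plain long arguments; the helper name adjustNdvForNulls is hypothetical and not part of the patch:

  // Sketch of the per-column NDV adjustment introduced by this patch (illustration only).
  // ndv == 0 normally means "NDV unknown", so no adjustment is safe -- except for
  // constant-NULL columns, which are generated with ndv == 0 and numNulls == numRows
  // even though their true NDV is 1 (the NULL itself).
  static long adjustNdvForNulls(long ndv, long numNulls, long parentNumRows) {
    if (ndv == 0L) {
      return (numNulls >= parentNumRows) ? 1L : 0L; // constant-NULL column counts as 1
    }
    if (numNulls > 0L) {
      return StatsUtils.safeAdd(ndv, 1L); // NULL counts as one extra distinct value
    }
    return ndv;
  }

The old code bumped any positive NDV by one when nulls were present and silently left ndv == 0 untouched; the rewrite makes the "unknown" case explicit and carves out the constant-NULL exception, so always-NULL GROUP BY keys are no longer treated as having unknown cardinality.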

ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java

Lines changed: 77 additions & 0 deletions
@@ -25,17 +25,24 @@

 import java.lang.reflect.Field;
 import java.lang.reflect.Modifier;
+import java.util.Collections;
+import java.util.List;
 import java.util.Set;
 import java.util.stream.Stream;

 import org.apache.commons.lang3.reflect.FieldUtils;
 import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
 import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
+import org.apache.hadoop.hive.metastore.api.Date;
+import org.apache.hadoop.hive.metastore.api.DateColumnStatsData;
 import org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData;
 import org.apache.hadoop.hive.metastore.api.LongColumnStatsData;
+import org.apache.hadoop.hive.metastore.api.Timestamp;
+import org.apache.hadoop.hive.metastore.api.TimestampColumnStatsData;
 import org.apache.hadoop.hive.ql.plan.ColStatistics;
 import org.apache.hadoop.hive.ql.plan.ColStatistics.Range;
+import org.apache.hadoop.hive.ql.plan.Statistics;
 import org.apache.hadoop.hive.serde.serdeConstants;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.params.ParameterizedTest;
@@ -244,4 +251,74 @@ static Stream<Arguments> floatingPointStatisticsTestData() {
     );
   }

+  @Test
+  void testGetColStatisticsTimestampType() {
+    ColumnStatisticsObj cso = new ColumnStatisticsObj();
+    cso.setColName("ts_col");
+    cso.setColType(serdeConstants.TIMESTAMP_TYPE_NAME);
+
+    TimestampColumnStatsData tsStats = new TimestampColumnStatsData();
+    tsStats.setNumDVs(35);
+    tsStats.setNumNulls(5);
+    tsStats.setLowValue(new Timestamp(1000));
+    tsStats.setHighValue(new Timestamp(2000));
+
+    ColumnStatisticsData data = new ColumnStatisticsData();
+    data.setTimestampStats(tsStats);
+    cso.setStatsData(data);
+
+    ColStatistics cs = StatsUtils.getColStatistics(cso, "ts_col");
+
+    assertNotNull(cs, "ColStatistics should not be null");
+    assertEquals(35, cs.getCountDistint(), "TIMESTAMP NumDVs should be extracted from metastore stats");
+    assertEquals(5, cs.getNumNulls(), "NumNulls mismatch");
+  }
+
+  @Test
+  void testGetColStatisticsDateType() {
+    ColumnStatisticsObj cso = new ColumnStatisticsObj();
+    cso.setColName("date_col");
+    cso.setColType(serdeConstants.DATE_TYPE_NAME);
+
+    DateColumnStatsData dateStats = new DateColumnStatsData();
+    dateStats.setNumDVs(42);
+    dateStats.setNumNulls(3);
+    dateStats.setLowValue(new Date(18000));
+    dateStats.setHighValue(new Date(19000));
+
+    ColumnStatisticsData data = new ColumnStatisticsData();
+    data.setDateStats(dateStats);
+    cso.setStatsData(data);
+
+    ColStatistics cs = StatsUtils.getColStatistics(cso, "date_col");
+
+    assertNotNull(cs, "ColStatistics should not be null");
+    assertEquals(42, cs.getCountDistint(), "DATE NumDVs should be extracted from metastore stats");
+    assertEquals(3, cs.getNumNulls(), "NumNulls mismatch");
+  }
+
+  private ColStatistics createColStats(String name, long ndv, long numNulls) {
+    ColStatistics cs = new ColStatistics(name, "string");
+    cs.setCountDistint(ndv);
+    cs.setNumNulls(numNulls);
+    return cs;
+  }
+
+  private Statistics createParentStats(long numRows) {
+    Statistics stats = new Statistics(numRows, 0, 0, 0);
+    stats.setColumnStatsState(Statistics.State.COMPLETE);
+    return stats;
+  }
+
+  @Test
+  void testComputeNDVGroupingColumnsPartialStats() {
+    ColStatistics cs = createColStats("partial_stats_col", 0, 100);
+    Statistics parentStats = createParentStats(1000);
+    List<ColStatistics> colStats = Collections.singletonList(cs);
+
+    long ndv = StatsUtils.computeNDVGroupingColumns(colStats, parentStats, false);
+
+    assertEquals(0, ndv, "Partial stats (ndv=0, numNulls<numRows) should return 0, not inflate to 1");
+  }
+
 }
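The new tests cover the partial-stats path (ndv=0 with numNulls below numRows must stay 0); the complementary constant-NULL path can be exercised the same way. A sketch of such a test, reusing the createColStats/createParentStats helpers from the patch (this test is not part of the commit, only an illustration of the behavior the new branch implies):

  @Test
  void testComputeNDVGroupingColumnsConstantNullColumn() {
    // ndv == 0 with numNulls == numRows marks a constant-NULL column,
    // whose actual NDV is 1 (the single NULL value).
    ColStatistics cs = createColStats("const_null_col", 0, 1000);
    Statistics parentStats = createParentStats(1000);
    List<ColStatistics> colStats = Collections.singletonList(cs);

    long ndv = StatsUtils.computeNDVGroupingColumns(colStats, parentStats, false);

    assertEquals(1, ndv, "Constant NULL column (ndv=0, numNulls>=numRows) should count as one distinct value");
  }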

ql/src/test/results/clientpositive/llap/parquet_vectorization_15.q.out

Lines changed: 7 additions & 7 deletions
@@ -119,7 +119,7 @@ STAGE PLANS:
 minReductionHashAggr: 0.4
 mode: hash
 outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16
-Statistics: Num rows: 6144 Data size: 1216372 Basic stats: COMPLETE Column stats: COMPLETE
+Statistics: Num rows: 12288 Data size: 2432638 Basic stats: COMPLETE Column stats: COMPLETE
 Reduce Output Operator
 key expressions: _col0 (type: float), _col1 (type: boolean), _col2 (type: double), _col3 (type: string), _col4 (type: tinyint), _col5 (type: int), _col6 (type: timestamp)
 null sort order: zzzzzzz
@@ -129,7 +129,7 @@ STAGE PLANS:
 className: VectorReduceSinkMultiKeyOperator
 native: true
 nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true
-Statistics: Num rows: 6144 Data size: 1216372 Basic stats: COMPLETE Column stats: COMPLETE
+Statistics: Num rows: 12288 Data size: 2432638 Basic stats: COMPLETE Column stats: COMPLETE
 value expressions: _col7 (type: double), _col8 (type: double), _col9 (type: bigint), _col10 (type: double), _col11 (type: double), _col12 (type: double), _col13 (type: bigint), _col14 (type: double), _col15 (type: double), _col16 (type: bigint)
 Execution mode: vectorized, llap
 LLAP IO: all inputs (cache only)
@@ -154,16 +154,16 @@ STAGE PLANS:
 keys: KEY._col0 (type: float), KEY._col1 (type: boolean), KEY._col2 (type: double), KEY._col3 (type: string), KEY._col4 (type: tinyint), KEY._col5 (type: int), KEY._col6 (type: timestamp)
 mode: mergepartial
 outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16
-Statistics: Num rows: 6144 Data size: 1216372 Basic stats: COMPLETE Column stats: COMPLETE
+Statistics: Num rows: 12288 Data size: 2432638 Basic stats: COMPLETE Column stats: COMPLETE
 Select Operator
 expressions: _col0 (type: float), _col1 (type: boolean), _col2 (type: double), _col3 (type: string), _col4 (type: tinyint), _col5 (type: int), _col6 (type: timestamp), power(((_col7 - ((_col8 * _col8) / _col9)) / if((_col9 = 1L), null, (_col9 - 1))), 0.5) (type: double), (-26.28 - CAST( _col5 AS decimal(10,0))) (type: decimal(13,2)), _col10 (type: double), (_col2 * 79.553D) (type: double), (33.0 % _col0) (type: float), power(((_col11 - ((_col12 * _col12) / _col13)) / if((_col13 = 1L), null, (_col13 - 1))), 0.5) (type: double), ((_col11 - ((_col12 * _col12) / _col13)) / _col13) (type: double), (-23.0D % _col2) (type: double), (- _col4) (type: tinyint), ((_col14 - ((_col15 * _col15) / _col16)) / if((_col16 = 1L), null, (_col16 - 1))) (type: double), (UDFToFloat(_col5) - _col0) (type: float), (-23 % UDFToInteger(_col4)) (type: int), (- (-26.28 - CAST( _col5 AS decimal(10,0)))) (type: decimal(13,2)), power(((_col14 - ((_col15 * _col15) / _col16)) / _col16), 0.5) (type: double)
 outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17, _col18, _col19, _col20
-Statistics: Num rows: 6144 Data size: 2592628 Basic stats: COMPLETE Column stats: COMPLETE
+Statistics: Num rows: 12288 Data size: 5185150 Basic stats: COMPLETE Column stats: COMPLETE
 Reduce Output Operator
 key expressions: _col0 (type: float), _col1 (type: boolean), _col2 (type: double), _col3 (type: string), _col4 (type: tinyint), _col5 (type: int), _col6 (type: timestamp)
 null sort order: zzzzzzz
 sort order: +++++++
-Statistics: Num rows: 6144 Data size: 2592628 Basic stats: COMPLETE Column stats: COMPLETE
+Statistics: Num rows: 12288 Data size: 5185150 Basic stats: COMPLETE Column stats: COMPLETE
 value expressions: _col7 (type: double), _col8 (type: decimal(13,2)), _col9 (type: double), _col10 (type: double), _col11 (type: float), _col12 (type: double), _col13 (type: double), _col14 (type: double), _col15 (type: tinyint), _col16 (type: double), _col17 (type: float), _col18 (type: int), _col19 (type: decimal(13,2)), _col20 (type: double)
 Reducer 3
 Execution mode: llap
@@ -175,10 +175,10 @@ STAGE PLANS:
 Select Operator
 expressions: KEY.reducesinkkey0 (type: float), KEY.reducesinkkey1 (type: boolean), KEY.reducesinkkey2 (type: double), KEY.reducesinkkey3 (type: string), KEY.reducesinkkey4 (type: tinyint), KEY.reducesinkkey5 (type: int), KEY.reducesinkkey6 (type: timestamp), VALUE._col0 (type: double), VALUE._col1 (type: decimal(13,2)), VALUE._col2 (type: double), VALUE._col3 (type: double), VALUE._col4 (type: float), VALUE._col5 (type: double), VALUE._col6 (type: double), VALUE._col7 (type: double), VALUE._col8 (type: tinyint), VALUE._col9 (type: double), VALUE._col10 (type: float), VALUE._col11 (type: int), VALUE._col12 (type: decimal(13,2)), VALUE._col13 (type: double)
 outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17, _col18, _col19, _col20
-Statistics: Num rows: 6144 Data size: 2592628 Basic stats: COMPLETE Column stats: COMPLETE
+Statistics: Num rows: 12288 Data size: 5185150 Basic stats: COMPLETE Column stats: COMPLETE
 File Output Operator
 compressed: false
-Statistics: Num rows: 6144 Data size: 2592628 Basic stats: COMPLETE Column stats: COMPLETE
+Statistics: Num rows: 12288 Data size: 5185150 Basic stats: COMPLETE Column stats: COMPLETE
 table:
 input format: org.apache.hadoop.mapred.SequenceFileInputFormat
 output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
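The estimate changes in this golden file are the downstream effect of the Java change above: the GROUP BY keys include a timestamp column (_col6), whose NDV previously could not be read from the metastore and so contributed nothing to the estimated number of groups. With the timestamp NDV now available, the NDV product over the seven keys grows, and the row estimate rises from 6144 to 12288, with data sizes scaling in proportion (1216372 -> 2432638, 2592628 -> 5185150). As a rough sketch of the estimation shape (the exact formula and its caps live in Hive's group-by statistics annotation rule; this expression is an assumption, not a quote of that code):

  estimatedGroups = min(ndv(_col0) * ndv(_col1) * ... * ndv(_col6), inputRows)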

ql/src/test/results/clientpositive/llap/parquet_vectorization_16.q.out

Lines changed: 5 additions & 5 deletions
@@ -96,7 +96,7 @@ STAGE PLANS:
 minReductionHashAggr: 0.4
 mode: hash
 outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6
-Statistics: Num rows: 5979 Data size: 825318 Basic stats: COMPLETE Column stats: COMPLETE
+Statistics: Num rows: 6144 Data size: 848064 Basic stats: COMPLETE Column stats: COMPLETE
 Reduce Output Operator
 key expressions: _col0 (type: string), _col1 (type: double), _col2 (type: timestamp)
 null sort order: zzz
@@ -106,7 +106,7 @@ STAGE PLANS:
 className: VectorReduceSinkMultiKeyOperator
 native: true
 nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true
-Statistics: Num rows: 5979 Data size: 825318 Basic stats: COMPLETE Column stats: COMPLETE
+Statistics: Num rows: 6144 Data size: 848064 Basic stats: COMPLETE Column stats: COMPLETE
 value expressions: _col3 (type: bigint), _col4 (type: double), _col5 (type: double), _col6 (type: double)
 Execution mode: vectorized, llap
 LLAP IO: all inputs (cache only)
@@ -141,7 +141,7 @@ STAGE PLANS:
 keys: KEY._col0 (type: string), KEY._col1 (type: double), KEY._col2 (type: timestamp)
 mode: mergepartial
 outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6
-Statistics: Num rows: 5979 Data size: 825318 Basic stats: COMPLETE Column stats: COMPLETE
+Statistics: Num rows: 6144 Data size: 848064 Basic stats: COMPLETE Column stats: COMPLETE
 Select Operator
 expressions: _col0 (type: string), _col1 (type: double), _col2 (type: timestamp), (_col1 - 9763215.5639D) (type: double), (- (_col1 - 9763215.5639D)) (type: double), _col3 (type: bigint), power((greatest(0,(_col4 - ((_col5 * _col5) / _col3))) / if((_col3 = 1L), null, (_col3 - 1))), 0.5) (type: double), (- power((greatest(0,(_col4 - ((_col5 * _col5) / _col3))) / if((_col3 = 1L), null, (_col3 - 1))), 0.5)) (type: double), (power((greatest(0,(_col4 - ((_col5 * _col5) / _col3))) / if((_col3 = 1L), null, (_col3 - 1))), 0.5) * UDFToDouble(_col3)) (type: double), _col6 (type: double), (9763215.5639D / _col1) (type: double), (CAST( _col3 AS decimal(19,0)) / -1.389) (type: decimal(28,6)), power((greatest(0,(_col4 - ((_col5 * _col5) / _col3))) / if((_col3 = 1L), null, (_col3 - 1))), 0.5) (type: double)
 outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12
@@ -150,13 +150,13 @@ STAGE PLANS:
 native: true
 projectedOutputColumnNums: [0, 1, 2, 7, 9, 3, 18, 28, 39, 6, 40, 42, 51]
 selectExpressions: DoubleColSubtractDoubleScalar(col 1:double, val 9763215.5639) -> 7:double, DoubleColUnaryMinus(col 8:double)(children: DoubleColSubtractDoubleScalar(col 1:double, val 9763215.5639) -> 8:double) -> 9:double, FuncPowerDoubleToDouble(col 17:double)(children: DoubleColDivideLongColumn(col 13:double, col 16:bigint)(children: VectorUDFAdaptor(greatest(0,(_col4 - ((_col5 * _col5) / _col3))))(children: DoubleColSubtractDoubleColumn(col 4:double, col 11:double)(children: DoubleColDivideLongColumn(col 10:double, col 3:bigint)(children: DoubleColMultiplyDoubleColumn(col 5:double, col 5:double) -> 10:double) -> 11:double) -> 12:double) -> 13:double, IfExprNullCondExpr(col 14:boolean, null, col 15:bigint)(children: LongColEqualLongScalar(col 3:bigint, val 1) -> 14:boolean, LongColSubtractLongScalar(col 3:bigint, val 1) -> 15:bigint) -> 16:bigint) -> 17:double) -> 18:double, DoubleColUnaryMinus(col 27:double)(children: FuncPowerDoubleToDouble(col 26:double)(children: DoubleColDivideLongColumn(col 22:double, col 25:bigint)(children: VectorUDFAdaptor(greatest(0,(_col4 - ((_col5 * _col5) / _col3))))(children: DoubleColSubtractDoubleColumn(col 4:double, col 20:double)(children: DoubleColDivideLongColumn(col 19:double, col 3:bigint)(children: DoubleColMultiplyDoubleColumn(col 5:double, col 5:double) -> 19:double) -> 20:double) -> 21:double) -> 22:double, IfExprNullCondExpr(col 23:boolean, null, col 24:bigint)(children: LongColEqualLongScalar(col 3:bigint, val 1) -> 23:boolean, LongColSubtractLongScalar(col 3:bigint, val 1) -> 24:bigint) -> 25:bigint) -> 26:double) -> 27:double) -> 28:double, DoubleColMultiplyDoubleColumn(col 37:double, col 38:double)(children: FuncPowerDoubleToDouble(col 36:double)(children: DoubleColDivideLongColumn(col 32:double, col 35:bigint)(children: VectorUDFAdaptor(greatest(0,(_col4 - ((_col5 * _col5) / _col3))))(children: DoubleColSubtractDoubleColumn(col 4:double, col 30:double)(children: DoubleColDivideLongColumn(col 29:double, col 3:bigint)(children: DoubleColMultiplyDoubleColumn(col 5:double, col 5:double) -> 29:double) -> 30:double) -> 31:double) -> 32:double, IfExprNullCondExpr(col 33:boolean, null, col 34:bigint)(children: LongColEqualLongScalar(col 3:bigint, val 1) -> 33:boolean, LongColSubtractLongScalar(col 3:bigint, val 1) -> 34:bigint) -> 35:bigint) -> 36:double) -> 37:double, CastLongToDouble(col 3:bigint) -> 38:double) -> 39:double, DoubleScalarDivideDoubleColumn(val 9763215.5639, col 1:double) -> 40:double, DecimalColDivideDecimalScalar(col 41:decimal(19,0), val -1.389)(children: CastLongToDecimal(col 3:bigint) -> 41:decimal(19,0)) -> 42:decimal(28,6), FuncPowerDoubleToDouble(col 50:double)(children: DoubleColDivideLongColumn(col 46:double, col 49:bigint)(children: VectorUDFAdaptor(greatest(0,(_col4 - ((_col5 * _col5) / _col3))))(children: DoubleColSubtractDoubleColumn(col 4:double, col 44:double)(children: DoubleColDivideLongColumn(col 43:double, col 3:bigint)(children: DoubleColMultiplyDoubleColumn(col 5:double, col 5:double) -> 43:double) -> 44:double) -> 45:double) -> 46:double, IfExprNullCondExpr(col 47:boolean, null, col 48:bigint)(children: LongColEqualLongScalar(col 3:bigint, val 1) -> 47:boolean, LongColSubtractLongScalar(col 3:bigint, val 1) -> 48:bigint) -> 49:bigint) -> 50:double) -> 51:double
-Statistics: Num rows: 5979 Data size: 1734126 Basic stats: COMPLETE Column stats: COMPLETE
+Statistics: Num rows: 6144 Data size: 1781952 Basic stats: COMPLETE Column stats: COMPLETE
 File Output Operator
 compressed: false
 File Sink Vectorization:
 className: VectorFileSinkOperator
 native: false
-Statistics: Num rows: 5979 Data size: 1734126 Basic stats: COMPLETE Column stats: COMPLETE
+Statistics: Num rows: 6144 Data size: 1781952 Basic stats: COMPLETE Column stats: COMPLETE
 table:
 input format: org.apache.hadoop.mapred.SequenceFileInputFormat
 output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
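The same mechanism produces a smaller shift here: one of the three grouping keys is a timestamp (_col2), so its newly read NDV nudges the estimated group count from 5979 up to 6144, with data sizes following (825318 -> 848064, 1734126 -> 1781952). Under the min(NDV product, input rows) sketch above, the string and double keys' NDVs already dominated the product, which would explain why the adjustment is modest; that is an inference from the numbers, not from the planner code.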
