Skip to content

Commit 633951c

Browse files
committed
HIVE-29368: more conservative NDV combining by PessimisticStatCombiner
1 parent ee7138b commit 633951c

File tree

5 files changed

+748
-14
lines changed

5 files changed

+748
-14
lines changed

ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,7 @@ public void add(ColStatistics stat) {
4141
if (stat.getAvgColLen() > result.getAvgColLen()) {
4242
result.setAvgColLen(stat.getAvgColLen());
4343
}
44-
if (stat.getCountDistint() > result.getCountDistint()) {
45-
result.setCountDistint(stat.getCountDistint());
46-
}
44+
result.setCountDistint(0L);
4745
if (stat.getNumNulls() > result.getNumNulls()) {
4846
result.setNumNulls(stat.getNumNulls());
4947
}
Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
package org.apache.hadoop.hive.ql.stats.estimator;
20+
21+
import static org.junit.jupiter.api.Assertions.assertEquals;
22+
import static org.junit.jupiter.api.Assertions.assertNull;
23+
import static org.junit.jupiter.api.Assertions.assertTrue;
24+
25+
import java.util.Optional;
26+
27+
import org.apache.hadoop.hive.ql.plan.ColStatistics;
28+
import org.apache.hadoop.hive.ql.plan.ColStatistics.Range;
29+
import org.junit.jupiter.api.Test;
30+
31+
class TestPessimisticStatCombiner {
32+
33+
@Test
34+
void testSingleStatPreservesNdv() {
35+
ColStatistics stat = createStat("col1", "int", 100, 10, 5.0);
36+
stat.setRange(new Range(0, 100));
37+
38+
PessimisticStatCombiner combiner = new PessimisticStatCombiner();
39+
combiner.add(stat);
40+
41+
Optional<ColStatistics> result = combiner.getResult();
42+
assertTrue(result.isPresent());
43+
ColStatistics combined = result.get();
44+
45+
assertEquals("col1", combined.getColumnName());
46+
assertEquals("int", combined.getColumnType());
47+
assertEquals(100, combined.getCountDistint());
48+
assertEquals(10, combined.getNumNulls());
49+
assertEquals(5.0, combined.getAvgColLen());
50+
assertNull(combined.getRange());
51+
assertTrue(combined.isEstimated());
52+
}
53+
54+
@Test
55+
void testCombineTakesMaxOfAvgColLen() {
56+
ColStatistics stat1 = createStat("col1", "string", 50, 5, 10.0);
57+
ColStatistics stat2 = createStat("col2", "string", 30, 3, 20.0);
58+
59+
PessimisticStatCombiner combiner = new PessimisticStatCombiner();
60+
combiner.add(stat1);
61+
combiner.add(stat2);
62+
63+
ColStatistics combined = combiner.getResult().get();
64+
assertEquals(20.0, combined.getAvgColLen());
65+
}
66+
67+
@Test
68+
void testCombineTakesMaxOfNumNulls() {
69+
ColStatistics stat1 = createStat("col1", "int", 50, 100, 4.0);
70+
ColStatistics stat2 = createStat("col2", "int", 30, 200, 4.0);
71+
72+
PessimisticStatCombiner combiner = new PessimisticStatCombiner();
73+
combiner.add(stat1);
74+
combiner.add(stat2);
75+
76+
ColStatistics combined = combiner.getResult().get();
77+
assertEquals(200, combined.getNumNulls());
78+
}
79+
80+
@Test
81+
void testCombineSetsCountDistinctToZero() {
82+
ColStatistics stat1 = createStat("col1", "int", 100, 10, 4.0);
83+
ColStatistics stat2 = createStat("col2", "int", 200, 20, 4.0);
84+
85+
PessimisticStatCombiner combiner = new PessimisticStatCombiner();
86+
combiner.add(stat1);
87+
combiner.add(stat2);
88+
89+
ColStatistics combined = combiner.getResult().get();
90+
assertEquals(0, combined.getCountDistint());
91+
}
92+
93+
@Test
94+
void testCombineTakesMaxOfNumTruesAndNumFalses() {
95+
ColStatistics stat1 = createStat("col1", "boolean", 2, 5, 1.0);
96+
stat1.setNumTrues(100);
97+
stat1.setNumFalses(50);
98+
99+
ColStatistics stat2 = createStat("col2", "boolean", 2, 10, 1.0);
100+
stat2.setNumTrues(50);
101+
stat2.setNumFalses(150);
102+
103+
PessimisticStatCombiner combiner = new PessimisticStatCombiner();
104+
combiner.add(stat1);
105+
combiner.add(stat2);
106+
107+
ColStatistics combined = combiner.getResult().get();
108+
assertEquals(100, combined.getNumTrues());
109+
assertEquals(150, combined.getNumFalses());
110+
}
111+
112+
@Test
113+
void testCombinePropagatesFilteredColumnFlag() {
114+
ColStatistics stat1 = createStat("col1", "int", 50, 5, 4.0);
115+
ColStatistics stat2 = createStat("col2", "int", 30, 3, 4.0);
116+
stat2.setFilterColumn();
117+
118+
PessimisticStatCombiner combiner = new PessimisticStatCombiner();
119+
combiner.add(stat1);
120+
combiner.add(stat2);
121+
122+
ColStatistics combined = combiner.getResult().get();
123+
assertTrue(combined.isFilteredColumn());
124+
}
125+
126+
@Test
127+
void testCombineMultipleStats() {
128+
ColStatistics stat1 = createStat("col1", "bigint", 1000, 50, 8.0);
129+
ColStatistics stat2 = createStat("col2", "bigint", 500, 100, 8.0);
130+
ColStatistics stat3 = createStat("col3", "bigint", 2000, 25, 8.0);
131+
132+
PessimisticStatCombiner combiner = new PessimisticStatCombiner();
133+
combiner.add(stat1);
134+
combiner.add(stat2);
135+
combiner.add(stat3);
136+
137+
ColStatistics combined = combiner.getResult().get();
138+
assertEquals(0, combined.getCountDistint());
139+
assertEquals(100, combined.getNumNulls());
140+
assertEquals(8.0, combined.getAvgColLen());
141+
}
142+
143+
@Test
144+
void testCombineSameColumnTwice() {
145+
ColStatistics stat = createStat("col1", "int", 100, 10, 4.0);
146+
147+
PessimisticStatCombiner combiner = new PessimisticStatCombiner();
148+
combiner.add(stat);
149+
combiner.add(stat);
150+
151+
ColStatistics combined = combiner.getResult().get();
152+
assertEquals(0, combined.getCountDistint());
153+
assertEquals(10, combined.getNumNulls());
154+
assertEquals(4.0, combined.getAvgColLen());
155+
}
156+
157+
private ColStatistics createStat(String name, String type, long ndv, long numNulls, double avgColLen) {
158+
ColStatistics stat = new ColStatistics(name, type);
159+
stat.setCountDistint(ndv);
160+
stat.setNumNulls(numNulls);
161+
stat.setAvgColLen(avgColLen);
162+
return stat;
163+
}
164+
}
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
-- Regression test for HIVE-29368: PessimisticStatCombiner combines NDVs more
-- conservatively. Tests 1-3 check the NDV estimated for IF/CASE expressions;
-- Test 4 checks the planner-level consequence (MapJoin selection).

-- Fixture: one table with fixed row count and a column whose NDV is known.
CREATE TABLE t1 (cat INT, val BIGINT, data STRING);
ALTER TABLE t1 UPDATE STATISTICS SET('numRows'='1000000','rawDataSize'='100000000');
ALTER TABLE t1 UPDATE STATISTICS FOR COLUMN cat SET('numDVs'='100','numNulls'='0');

-- Test 1: IF should result in NDV of 2
EXPLAIN
SELECT x, COUNT(*)
FROM (SELECT IF(cat > 50, 'A', 'B') x FROM t1) sub
GROUP BY x;

-- Test 2: CASE WHEN should result in NDV of 3
EXPLAIN
SELECT x, COUNT(*)
FROM (
  SELECT CASE WHEN cat < 30 THEN 'X' WHEN cat < 60 THEN 'Y' ELSE 'Z' END x
  FROM t1
) sub
GROUP BY x;

-- Test 3: CASE col WHEN val should result in NDV of 4
EXPLAIN
SELECT x, COUNT(*)
FROM (
  SELECT CASE cat WHEN 1 THEN 'A' WHEN 2 THEN 'B' WHEN 3 THEN 'C' ELSE 'D' END x
  FROM t1
) sub
GROUP BY x;

-- Test 4: MapJoin NO longer chosen due to NDV=1 causing tiny size estimate
CREATE TABLE t2 (key STRING, v1 STRING);

-- Statistics chosen so the aggregated subquery is small only if its NDV
-- estimate is misleadingly low; the noconditionaltask.size threshold below
-- is deliberately tiny so the MapJoin decision hinges on that estimate.
ALTER TABLE t1 UPDATE STATISTICS FOR COLUMN val SET('numDVs'='1000000','numNulls'='0');
ALTER TABLE t1 UPDATE STATISTICS FOR COLUMN data SET('numDVs'='5000000','numNulls'='0','avgColLen'='500.0','maxColLen'='600');
ALTER TABLE t2 UPDATE STATISTICS SET('numRows'='1000000','rawDataSize'='100000000');
ALTER TABLE t2 UPDATE STATISTICS FOR COLUMN key SET('numDVs'='1000000','numNulls'='0','avgColLen'='50.0','maxColLen'='100');
ALTER TABLE t2 UPDATE STATISTICS FOR COLUMN v1 SET('numDVs'='1000000','numNulls'='0','avgColLen'='50.0','maxColLen'='100');
SET hive.auto.convert.join=true;
SET hive.auto.convert.join.noconditionaltask.size=1000;

-- The CASE maps cat (NDV=100) onto 20 bucket labels K00..K19 before the
-- GROUP BY, so the join-side row estimate depends on the combined NDV.
EXPLAIN
SELECT a.k, a.total, a.sample, b.v1
FROM (
  SELECT
    k,
    SUM(val) as total,
    MAX(data) as sample
  FROM (
    SELECT
      CASE
        WHEN cat BETWEEN 0 AND 4 THEN 'K00'
        WHEN cat BETWEEN 5 AND 9 THEN 'K01'
        WHEN cat BETWEEN 10 AND 14 THEN 'K02'
        WHEN cat BETWEEN 15 AND 19 THEN 'K03'
        WHEN cat BETWEEN 20 AND 24 THEN 'K04'
        WHEN cat BETWEEN 25 AND 29 THEN 'K05'
        WHEN cat BETWEEN 30 AND 34 THEN 'K06'
        WHEN cat BETWEEN 35 AND 39 THEN 'K07'
        WHEN cat BETWEEN 40 AND 44 THEN 'K08'
        WHEN cat BETWEEN 45 AND 49 THEN 'K09'
        WHEN cat BETWEEN 50 AND 54 THEN 'K10'
        WHEN cat BETWEEN 55 AND 59 THEN 'K11'
        WHEN cat BETWEEN 60 AND 64 THEN 'K12'
        WHEN cat BETWEEN 65 AND 69 THEN 'K13'
        WHEN cat BETWEEN 70 AND 74 THEN 'K14'
        WHEN cat BETWEEN 75 AND 79 THEN 'K15'
        WHEN cat BETWEEN 80 AND 84 THEN 'K16'
        WHEN cat BETWEEN 85 AND 89 THEN 'K17'
        WHEN cat BETWEEN 90 AND 94 THEN 'K18'
        ELSE 'K19'
      END as k,
      val,
      data
    FROM t1
  ) s
  GROUP BY k
) a
JOIN t2 b ON a.k = b.key;

0 commit comments

Comments
 (0)