Skip to content

Commit 633951c

Browse files
committed
HIVE-29368: more conservative NDV combining by PessimisticStatCombiner
1 parent ee7138b commit 633951c

File tree

5 files changed

+748
-14
lines changed

5 files changed

+748
-14
lines changed

ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,7 @@ public void add(ColStatistics stat) {
4141
if (stat.getAvgColLen() > result.getAvgColLen()) {
4242
result.setAvgColLen(stat.getAvgColLen());
4343
}
44-
if (stat.getCountDistint() > result.getCountDistint()) {
45-
result.setCountDistint(stat.getCountDistint());
46-
}
44+
result.setCountDistint(0L);
4745
if (stat.getNumNulls() > result.getNumNulls()) {
4846
result.setNumNulls(stat.getNumNulls());
4947
}
Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
package org.apache.hadoop.hive.ql.stats.estimator;
20+
21+
import static org.junit.jupiter.api.Assertions.assertEquals;
22+
import static org.junit.jupiter.api.Assertions.assertNull;
23+
import static org.junit.jupiter.api.Assertions.assertTrue;
24+
25+
import java.util.Optional;
26+
27+
import org.apache.hadoop.hive.ql.plan.ColStatistics;
28+
import org.apache.hadoop.hive.ql.plan.ColStatistics.Range;
29+
import org.junit.jupiter.api.Test;
30+
31+
class TestPessimisticStatCombiner {
32+
33+
@Test
34+
void testSingleStatPreservesNdv() {
35+
ColStatistics stat = createStat("col1", "int", 100, 10, 5.0);
36+
stat.setRange(new Range(0, 100));
37+
38+
PessimisticStatCombiner combiner = new PessimisticStatCombiner();
39+
combiner.add(stat);
40+
41+
Optional<ColStatistics> result = combiner.getResult();
42+
assertTrue(result.isPresent());
43+
ColStatistics combined = result.get();
44+
45+
assertEquals("col1", combined.getColumnName());
46+
assertEquals("int", combined.getColumnType());
47+
assertEquals(100, combined.getCountDistint());
48+
assertEquals(10, combined.getNumNulls());
49+
assertEquals(5.0, combined.getAvgColLen());
50+
assertNull(combined.getRange());
51+
assertTrue(combined.isEstimated());
52+
}
53+
54+
@Test
55+
void testCombineTakesMaxOfAvgColLen() {
56+
ColStatistics stat1 = createStat("col1", "string", 50, 5, 10.0);
57+
ColStatistics stat2 = createStat("col2", "string", 30, 3, 20.0);
58+
59+
PessimisticStatCombiner combiner = new PessimisticStatCombiner();
60+
combiner.add(stat1);
61+
combiner.add(stat2);
62+
63+
ColStatistics combined = combiner.getResult().get();
64+
assertEquals(20.0, combined.getAvgColLen());
65+
}
66+
67+
@Test
68+
void testCombineTakesMaxOfNumNulls() {
69+
ColStatistics stat1 = createStat("col1", "int", 50, 100, 4.0);
70+
ColStatistics stat2 = createStat("col2", "int", 30, 200, 4.0);
71+
72+
PessimisticStatCombiner combiner = new PessimisticStatCombiner();
73+
combiner.add(stat1);
74+
combiner.add(stat2);
75+
76+
ColStatistics combined = combiner.getResult().get();
77+
assertEquals(200, combined.getNumNulls());
78+
}
79+
80+
@Test
81+
void testCombineSetsCountDistinctToZero() {
82+
ColStatistics stat1 = createStat("col1", "int", 100, 10, 4.0);
83+
ColStatistics stat2 = createStat("col2", "int", 200, 20, 4.0);
84+
85+
PessimisticStatCombiner combiner = new PessimisticStatCombiner();
86+
combiner.add(stat1);
87+
combiner.add(stat2);
88+
89+
ColStatistics combined = combiner.getResult().get();
90+
assertEquals(0, combined.getCountDistint());
91+
}
92+
93+
@Test
94+
void testCombineTakesMaxOfNumTruesAndNumFalses() {
95+
ColStatistics stat1 = createStat("col1", "boolean", 2, 5, 1.0);
96+
stat1.setNumTrues(100);
97+
stat1.setNumFalses(50);
98+
99+
ColStatistics stat2 = createStat("col2", "boolean", 2, 10, 1.0);
100+
stat2.setNumTrues(50);
101+
stat2.setNumFalses(150);
102+
103+
PessimisticStatCombiner combiner = new PessimisticStatCombiner();
104+
combiner.add(stat1);
105+
combiner.add(stat2);
106+
107+
ColStatistics combined = combiner.getResult().get();
108+
assertEquals(100, combined.getNumTrues());
109+
assertEquals(150, combined.getNumFalses());
110+
}
111+
112+
@Test
113+
void testCombinePropagatesFilteredColumnFlag() {
114+
ColStatistics stat1 = createStat("col1", "int", 50, 5, 4.0);
115+
ColStatistics stat2 = createStat("col2", "int", 30, 3, 4.0);
116+
stat2.setFilterColumn();
117+
118+
PessimisticStatCombiner combiner = new PessimisticStatCombiner();
119+
combiner.add(stat1);
120+
combiner.add(stat2);
121+
122+
ColStatistics combined = combiner.getResult().get();
123+
assertTrue(combined.isFilteredColumn());
124+
}
125+
126+
@Test
127+
void testCombineMultipleStats() {
128+
ColStatistics stat1 = createStat("col1", "bigint", 1000, 50, 8.0);
129+
ColStatistics stat2 = createStat("col2", "bigint", 500, 100, 8.0);
130+
ColStatistics stat3 = createStat("col3", "bigint", 2000, 25, 8.0);
131+
132+
PessimisticStatCombiner combiner = new PessimisticStatCombiner();
133+
combiner.add(stat1);
134+
combiner.add(stat2);
135+
combiner.add(stat3);
136+
137+
ColStatistics combined = combiner.getResult().get();
138+
assertEquals(0, combined.getCountDistint());
139+
assertEquals(100, combined.getNumNulls());
140+
assertEquals(8.0, combined.getAvgColLen());
141+
}
142+
143+
@Test
144+
void testCombineSameColumnTwice() {
145+
ColStatistics stat = createStat("col1", "int", 100, 10, 4.0);
146+
147+
PessimisticStatCombiner combiner = new PessimisticStatCombiner();
148+
combiner.add(stat);
149+
combiner.add(stat);
150+
151+
ColStatistics combined = combiner.getResult().get();
152+
assertEquals(0, combined.getCountDistint());
153+
assertEquals(10, combined.getNumNulls());
154+
assertEquals(4.0, combined.getAvgColLen());
155+
}
156+
157+
private ColStatistics createStat(String name, String type, long ndv, long numNulls, double avgColLen) {
158+
ColStatistics stat = new ColStatistics(name, type);
159+
stat.setCountDistint(ndv);
160+
stat.setNumNulls(numNulls);
161+
stat.setAvgColLen(avgColLen);
162+
return stat;
163+
}
164+
}
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
-- Regression test for HIVE-29368: PessimisticStatCombiner combines NDVs more
-- conservatively. Tests 1-3 check the NDV estimated for IF/CASE expressions;
-- Test 4 checks the planner-level consequence (MapJoin selection).

-- Fixture: one table with fixed row count and a column whose NDV is known.
CREATE TABLE t1 (cat INT, val BIGINT, data STRING);
ALTER TABLE t1 UPDATE STATISTICS SET('numRows'='1000000','rawDataSize'='100000000');
ALTER TABLE t1 UPDATE STATISTICS FOR COLUMN cat SET('numDVs'='100','numNulls'='0');

-- Test 1: IF should result in NDV of 2
EXPLAIN
SELECT x, COUNT(*)
FROM (SELECT IF(cat > 50, 'A', 'B') x FROM t1) sub
GROUP BY x;

-- Test 2: CASE WHEN should result in NDV of 3
EXPLAIN
SELECT x, COUNT(*)
FROM (
  SELECT CASE WHEN cat < 30 THEN 'X' WHEN cat < 60 THEN 'Y' ELSE 'Z' END x
  FROM t1
) sub
GROUP BY x;

-- Test 3: CASE col WHEN val should result in NDV of 4
EXPLAIN
SELECT x, COUNT(*)
FROM (
  SELECT CASE cat WHEN 1 THEN 'A' WHEN 2 THEN 'B' WHEN 3 THEN 'C' ELSE 'D' END x
  FROM t1
) sub
GROUP BY x;

-- Test 4: MapJoin NO longer chosen due to NDV=1 causing tiny size estimate
CREATE TABLE t2 (key STRING, v1 STRING);

-- Statistics chosen so the aggregated subquery is small only if its NDV
-- estimate is misleadingly low; the noconditionaltask.size threshold below
-- is deliberately tiny so the MapJoin decision hinges on that estimate.
ALTER TABLE t1 UPDATE STATISTICS FOR COLUMN val SET('numDVs'='1000000','numNulls'='0');
ALTER TABLE t1 UPDATE STATISTICS FOR COLUMN data SET('numDVs'='5000000','numNulls'='0','avgColLen'='500.0','maxColLen'='600');
ALTER TABLE t2 UPDATE STATISTICS SET('numRows'='1000000','rawDataSize'='100000000');
ALTER TABLE t2 UPDATE STATISTICS FOR COLUMN key SET('numDVs'='1000000','numNulls'='0','avgColLen'='50.0','maxColLen'='100');
ALTER TABLE t2 UPDATE STATISTICS FOR COLUMN v1 SET('numDVs'='1000000','numNulls'='0','avgColLen'='50.0','maxColLen'='100');
SET hive.auto.convert.join=true;
SET hive.auto.convert.join.noconditionaltask.size=1000;

-- The CASE maps cat (NDV=100) onto 20 bucket labels K00..K19 before the
-- GROUP BY, so the join-side row estimate depends on the combined NDV.
EXPLAIN
SELECT a.k, a.total, a.sample, b.v1
FROM (
  SELECT
    k,
    SUM(val) as total,
    MAX(data) as sample
  FROM (
    SELECT
      CASE
        WHEN cat BETWEEN 0 AND 4 THEN 'K00'
        WHEN cat BETWEEN 5 AND 9 THEN 'K01'
        WHEN cat BETWEEN 10 AND 14 THEN 'K02'
        WHEN cat BETWEEN 15 AND 19 THEN 'K03'
        WHEN cat BETWEEN 20 AND 24 THEN 'K04'
        WHEN cat BETWEEN 25 AND 29 THEN 'K05'
        WHEN cat BETWEEN 30 AND 34 THEN 'K06'
        WHEN cat BETWEEN 35 AND 39 THEN 'K07'
        WHEN cat BETWEEN 40 AND 44 THEN 'K08'
        WHEN cat BETWEEN 45 AND 49 THEN 'K09'
        WHEN cat BETWEEN 50 AND 54 THEN 'K10'
        WHEN cat BETWEEN 55 AND 59 THEN 'K11'
        WHEN cat BETWEEN 60 AND 64 THEN 'K12'
        WHEN cat BETWEEN 65 AND 69 THEN 'K13'
        WHEN cat BETWEEN 70 AND 74 THEN 'K14'
        WHEN cat BETWEEN 75 AND 79 THEN 'K15'
        WHEN cat BETWEEN 80 AND 84 THEN 'K16'
        WHEN cat BETWEEN 85 AND 89 THEN 'K17'
        WHEN cat BETWEEN 90 AND 94 THEN 'K18'
        ELSE 'K19'
      END as k,
      val,
      data
    FROM t1
  ) s
  GROUP BY k
) a
JOIN t2 b ON a.k = b.key;

0 commit comments

Comments
 (0)