apache · konstantinb · Dec 16, 2025 · Dec 17, 2025 · Dec 18, 2025 · deniskuzZ
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConvertJoinMapJoin.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConvertJoinMapJoin.java
@@ -280,20 +280,20 @@ private boolean selectJoinForLlap(OptimizeTezProcContext context, JoinOperator j
         continue;
       }
       Operator<? extends OperatorDesc> parentOp = joinOp.getParentOperators().get(pos);
-      totalSize += computeOnlineDataSize(parentOp.getStatistics());
+      totalSize = StatsUtils.safeAdd(totalSize, computeOnlineDataSize(parentOp.getStatistics()));
     }
 
     // Size of bigtable
     long bigTableSize = computeOnlineDataSize(joinOp.getParentOperators().get(mapJoinConversionPos).getStatistics());
 
     // Network cost of DPHJ
-    long networkCostDPHJ = totalSize + bigTableSize;
+    long networkCostDPHJ = StatsUtils.safeAdd(totalSize, bigTableSize);
 
     LOG.info("Cost of dynamically partitioned hash join : total small table size = " + totalSize
-    + " bigTableSize = " + bigTableSize + "networkCostDPHJ = " + networkCostDPHJ);
+    + " bigTableSize = " + bigTableSize + " networkCostDPHJ = " + networkCostDPHJ);
 
     // Network cost of map side join
-    long networkCostMJ = numNodes * totalSize;
+    long networkCostMJ = StatsUtils.safeMult(numNodes, totalSize);
     LOG.info("Cost of Bucket Map Join : numNodes = " + numNodes + " total small table size = "
     + totalSize + " networkCostMJ = " + networkCostMJ);
 
@@ -363,9 +363,13 @@ public long computeOnlineDataSizeGeneric(Statistics statistics, long overHeadPer
       numRows = 1;
     }
     long worstCaseNeededSlots = 1L << DoubleMath.log2(numRows / hashTableLoadFactor, RoundingMode.UP);
-    onlineDataSize += statistics.getDataSize() - hashTableDataSizeAdjustment(numRows, statistics.getColumnStats());
-    onlineDataSize += overHeadPerRow * statistics.getNumRows();
-    onlineDataSize += overHeadPerSlot * worstCaseNeededSlots;
+    long adjustedDataSize = Math.max(0L,
+        statistics.getDataSize() - hashTableDataSizeAdjustment(numRows, statistics.getColumnStats()));
+    onlineDataSize = StatsUtils.safeAdd(onlineDataSize, adjustedDataSize);
+    onlineDataSize = StatsUtils.safeAdd(onlineDataSize,
+        StatsUtils.safeMult(overHeadPerRow, statistics.getNumRows()));
+    onlineDataSize = StatsUtils.safeAdd(onlineDataSize,
+        StatsUtils.safeMult(overHeadPerSlot, worstCaseNeededSlots));
     return onlineDataSize;
   }
 
@@ -384,7 +388,7 @@ private static long hashTableDataSizeAdjustment(long numRows, List<ColStatistics
     for (ColStatistics cs : colStats) {
       if (cs != null) {
         String colTypeLowerCase = cs.getColumnType().toLowerCase();
-        long nonNullCount = cs.getNumNulls() > 0 ? numRows - cs.getNumNulls() + 1 : numRows;
+        long nonNullCount = cs.getNumNulls() > 0 ? Math.max(1L, numRows - cs.getNumNulls() + 1) : numRows;
         double overhead = 0;
         if (colTypeLowerCase.equals(serdeConstants.STRING_TYPE_NAME)
             || colTypeLowerCase.startsWith(serdeConstants.VARCHAR_TYPE_NAME)
@@ -1248,7 +1252,7 @@ public MapJoinConversion getMapJoinConversion(JoinOperator joinOp, OptimizeTezPr
       if (bigInputStat != null && selectedBigTable) {
         // We are replacing the current big table with a new one, thus
         // we need to count the current one as a map table then.
-        totalSize += computeOnlineDataSize(bigInputStat);
+        totalSize = StatsUtils.safeAdd(totalSize, computeOnlineDataSize(bigInputStat));
         // Check if number of distinct keys is greater than given max number of entries
         // for HashMap
         if (checkMapJoinThresholds && !checkNumberOfEntriesForHashTable(joinOp, bigTablePosition, context)) {
@@ -1257,7 +1261,7 @@ public MapJoinConversion getMapJoinConversion(JoinOperator joinOp, OptimizeTezPr
       } else if (!selectedBigTable) {
         // This is not the first table and we are not using it as big table,
         // in fact, we're adding this table as a map table
-        totalSize += inputSize;
+        totalSize = StatsUtils.safeAdd(totalSize, inputSize);
         // Check if number of distinct keys is greater than given max number of entries
         // for HashMap
         if (checkMapJoinThresholds && !checkNumberOfEntriesForHashTable(joinOp, pos, context)) {
@@ -1342,15 +1346,15 @@ private static Long computeCumulativeCardinality(Operator<? extends OperatorDesc
         if (inputCardinality == null) {
           return null;
         }
-        cumulativeCardinality += inputCardinality;
+        cumulativeCardinality = StatsUtils.safeAdd(cumulativeCardinality, inputCardinality);
       }
     }
     Statistics currInputStat = op.getStatistics();
     if (currInputStat == null) {
       LOG.warn("Couldn't get statistics from: " + op);
       return null;
     }
-    cumulativeCardinality += currInputStat.getNumRows();
+    cumulativeCardinality = StatsUtils.safeAdd(cumulativeCardinality, currInputStat.getNumRows());
     return cumulativeCardinality;
   }
 

diff --git a/ql/src/test/org/apache/hadoop/hive/ql/optimizer/TestConvertJoinMapJoin.java b/ql/src/test/org/apache/hadoop/hive/ql/optimizer/TestConvertJoinMapJoin.java
@@ -0,0 +1,185 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.optimizer;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
+
+import java.lang.reflect.Method;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.plan.ColStatistics;
+import org.apache.hadoop.hive.ql.plan.Statistics;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Regression tests for arithmetic overflow and underflow in the size and cardinality
+ * estimates computed by {@link ConvertJoinMapJoin}.
+ */
+class TestConvertJoinMapJoin {
+
+  /** A data size of Long.MAX_VALUE must not produce a negative estimate. */
+  @Test
+  void testComputeOnlineDataSizeGenericLargeDataSize() {
+    ConvertJoinMapJoin converter = new ConvertJoinMapJoin();
+    converter.hashTableLoadFactor = 0.75f;
+    Statistics stats = new Statistics(1000L, Long.MAX_VALUE, 0L, 0L);
+
+    long result = converter.computeOnlineDataSizeGeneric(stats, 10L, 8L);
+
+    assertTrue(result >= 0, "Result should not be negative due to overflow");
+  }
+
+  /** A huge row count combined with huge per-row and per-slot overheads must saturate at Long.MAX_VALUE. */
+  @Test
+  void testComputeOnlineDataSizeGenericLargeNumRowsWithOverhead() {
+    ConvertJoinMapJoin converter = new ConvertJoinMapJoin();
+    converter.hashTableLoadFactor = 0.75f;
+    Statistics stats = new Statistics(Long.MAX_VALUE / 2, 1000L, 0L, 0L);
+
+    long result = converter.computeOnlineDataSizeGeneric(stats, Long.MAX_VALUE / 4, Long.MAX_VALUE / 4);
+
+    assertTrue(result >= 0, "Result should not be negative due to overflow");
+    assertEquals(Long.MAX_VALUE, result, "Result should saturate at Long.MAX_VALUE");
+  }
+
+  /** A column whose null count exceeds the row count must not produce a negative estimate. */
+  @Test
+  void testComputeOnlineDataSizeGenericNumNullsLargerThanNumRows() {
+    ConvertJoinMapJoin converter = new ConvertJoinMapJoin();
+    converter.hashTableLoadFactor = 0.75f;
+    Statistics stats = new Statistics(100L, 10000L, 0L, 0L);
+    List<ColStatistics> colStats = new ArrayList<>();
+    ColStatistics cs = new ColStatistics("col1", "string");
+    cs.setNumNulls(Long.MAX_VALUE);
+    colStats.add(cs);
+    stats.setColumnStats(colStats);
+
+    long result = converter.computeOnlineDataSizeGeneric(stats, 10L, 8L);
+
+    assertTrue(result >= 0, "Result should not be negative due to underflow in nonNullCount");
+  }
+
+  /** A data size (100) far smaller than the row count (1,000,000) must not produce a negative estimate. */
+  @Test
+  void testComputeOnlineDataSizeGenericSmallDataSizeLargeAdjustment() {
+    ConvertJoinMapJoin converter = new ConvertJoinMapJoin();
+    converter.hashTableLoadFactor = 0.75f;
+    Statistics stats = new Statistics(1000000L, 100L, 0L, 0L);
+    List<ColStatistics> colStats = new ArrayList<>();
+    ColStatistics cs = new ColStatistics("col1", "string");
+    cs.setNumNulls(0L);
+    colStats.add(cs);
+    stats.setColumnStats(colStats);
+
+    long result = converter.computeOnlineDataSizeGeneric(stats, 10L, 8L);
+
+    assertTrue(result >= 0, "Result should not be negative when adjustment > dataSize");
+  }
+
+  /** Long.MAX_VALUE for every input must saturate at Long.MAX_VALUE instead of overflowing. */
+  @Test
+  void testComputeOnlineDataSizeGenericAllExtremeValues() {
+    ConvertJoinMapJoin converter = new ConvertJoinMapJoin();
+    converter.hashTableLoadFactor = 0.75f;
+    Statistics stats = new Statistics(Long.MAX_VALUE, Long.MAX_VALUE, 0L, 0L);
+    List<ColStatistics> colStats = new ArrayList<>();
+    ColStatistics cs = new ColStatistics("col1", "string");
+    cs.setNumNulls(Long.MAX_VALUE);
+    colStats.add(cs);
+    stats.setColumnStats(colStats);
+
+    long result = converter.computeOnlineDataSizeGeneric(stats, Long.MAX_VALUE, Long.MAX_VALUE);
+
+    assertTrue(result >= 0, "Result should not be negative with extreme values");
+    assertEquals(Long.MAX_VALUE, result, "Result should saturate at Long.MAX_VALUE");
+  }
+
+  /** Summing an operator and its two parents, each reporting Long.MAX_VALUE / 2 rows, must saturate. */
+  @Test
+  void testComputeCumulativeCardinalityWithParentsOverflow() {
+    Operator<?> parent1 = createMockOperatorWithStats(Long.MAX_VALUE / 2);
+    when(parent1.getParentOperators()).thenReturn(Collections.emptyList());
+    Operator<?> parent2 = createMockOperatorWithStats(Long.MAX_VALUE / 2);
+    when(parent2.getParentOperators()).thenReturn(Collections.emptyList());
+    Operator<?> mockOp = createMockOperatorWithStats(Long.MAX_VALUE / 2);
+    when(mockOp.getParentOperators()).thenReturn(Arrays.asList(parent1, parent2));
+
+    Long result = invokeComputeCumulativeCardinality(mockOp);
+
+    assertNotNull(result, "Result should not be null");
+    assertTrue(result >= 0, "Result should not be negative due to overflow");
+    assertEquals(Long.MAX_VALUE, result.longValue(), "Result should saturate at Long.MAX_VALUE");
+  }
+
+  /** Summing a chain of four operators, each reporting Long.MAX_VALUE / 2 rows, must saturate. */
+  @Test
+  void testComputeCumulativeCardinalityDeepTreeOverflow() {
+    Operator<?> leaf = createMockOperatorWithStats(Long.MAX_VALUE / 2);
+    when(leaf.getParentOperators()).thenReturn(Collections.emptyList());
+    Operator<?> mid1 = createMockOperatorWithStats(Long.MAX_VALUE / 2);
+    when(mid1.getParentOperators()).thenReturn(Collections.singletonList(leaf));
+    Operator<?> mid2 = createMockOperatorWithStats(Long.MAX_VALUE / 2);
+    when(mid2.getParentOperators()).thenReturn(Collections.singletonList(mid1));
+    Operator<?> root = createMockOperatorWithStats(Long.MAX_VALUE / 2);
+    when(root.getParentOperators()).thenReturn(Collections.singletonList(mid2));
+
+    Long result = invokeComputeCumulativeCardinality(root);
+
+    assertNotNull(result, "Result should not be null");
+    assertTrue(result >= 0, "Result should not be negative due to overflow");
+    assertEquals(Long.MAX_VALUE, result.longValue(), "Result should saturate at Long.MAX_VALUE");
+  }
+
+  /**
+   * Creates a mocked operator whose statistics report {@code numRows} rows and a data size of
+   * 100 per row, saturated at {@code Long.MAX_VALUE}.
+   */
+  @SuppressWarnings("unchecked")
+  private Operator<?> createMockOperatorWithStats(long numRows) {
+    Operator<?> mockOp = mock(Operator.class);
+    // Saturate rather than wrap: numRows * 100 overflows a long for the large row counts used here.
+    long dataSize = numRows > Long.MAX_VALUE / 100 ? Long.MAX_VALUE : numRows * 100;
+    Statistics stats = new Statistics(numRows, dataSize, 0L, 0L);
+    when(mockOp.getStatistics()).thenReturn(stats);
+    return mockOp;
+  }
+
+  /**
+   * Calls the private static {@code ConvertJoinMapJoin.computeCumulativeCardinality(Operator)}
+   * through reflection and returns its result.
+   */
+  private Long invokeComputeCumulativeCardinality(Operator<?> op) {
+    try {
+      Method method = ConvertJoinMapJoin.class.getDeclaredMethod(
+          "computeCumulativeCardinality", Operator.class);
+      method.setAccessible(true);
+      return (Long) method.invoke(null, op);
+    } catch (ReflectiveOperationException e) {
+      throw new IllegalStateException("Failed to invoke computeCumulativeCardinality reflectively", e);
+    }
+  }
+}
diff --git a/ql/src/test/queries/clientpositive/mapjoin_stats_overflow.q b/ql/src/test/queries/clientpositive/mapjoin_stats_overflow.q
@@ -0,0 +1,26 @@
+-- Test overflow handling in computeOnlineDataSize with Long.MAX_VALUE statistics
+
+SET hive.auto.convert.join=true;
+SET hive.auto.convert.join.noconditionaltask=true;
+SET hive.auto.convert.join.noconditionaltask.size=10000000;
+
+CREATE TABLE t1 (k BIGINT, v STRING);
+CREATE TABLE t2 (k BIGINT, v STRING);
+
+-- Case 1: Normal statistics - t1 fits in 10MB threshold, MapJoin expected
+ALTER TABLE t1 UPDATE STATISTICS SET('numRows'='10000','rawDataSize'='100000');
+ALTER TABLE t1 UPDATE STATISTICS FOR COLUMN k SET('numDVs'='10000','numNulls'='0');
+ALTER TABLE t1 UPDATE STATISTICS FOR COLUMN v SET('numDVs'='10000','numNulls'='0','avgColLen'='10.0','maxColLen'='20');
+
+ALTER TABLE t2 UPDATE STATISTICS SET('numRows'='1000000','rawDataSize'='10000000');
+ALTER TABLE t2 UPDATE STATISTICS FOR COLUMN k SET('numDVs'='1000000','numNulls'='0');
+ALTER TABLE t2 UPDATE STATISTICS FOR COLUMN v SET('numDVs'='1000000','numNulls'='0','avgColLen'='10.0','maxColLen'='20');
+
+EXPLAIN SELECT t1.k, t2.v FROM t1 JOIN t2 ON t1.k = t2.k;
+
+-- Case 2: Long.MAX_VALUE numRows - without fix, overflow causes negative size and incorrect MapJoin
+ALTER TABLE t1 UPDATE STATISTICS SET('numRows'='9223372036854775807','rawDataSize'='9223372036854775807');
+ALTER TABLE t1 UPDATE STATISTICS FOR COLUMN k SET('numDVs'='1000','numNulls'='0');
+ALTER TABLE t1 UPDATE STATISTICS FOR COLUMN v SET('numDVs'='1000','numNulls'='0','avgColLen'='10.0','maxColLen'='20');
+
+EXPLAIN SELECT t1.k, t1.v, t2.v FROM t1 JOIN t2 ON t1.k = t2.k;