
Commit cd6029d

viirya authored and Robert Kruszewski committed
[SPARK-21052][SQL] Add hash map metrics to join
## What changes were proposed in this pull request?

This adds the average hash map probe metric to join operators such as `BroadcastHashJoin` and `ShuffledHashJoin`. This PR adds an API to `HashedRelation` to get the average hash map probe.

## How was this patch tested?

Related test cases are added.

Author: Liang-Chi Hsieh <[email protected]>

Closes apache#18301 from viirya/SPARK-21052.
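The reported metric is simply total probes divided by total key lookups, accumulated inside each hash map and read out once per task. A minimal self-contained sketch of that bookkeeping (ProbeStats and its methods are illustrative stand-ins, not Spark classes):

// Sketch of the probe bookkeeping this patch adds to its hash maps.
// ProbeStats is a hypothetical stand-in, not a Spark class.
class ProbeStats {
  private var numKeyLookups = 0L
  private var numProbes = 0L

  // Called once per lookup with the number of slots that were inspected.
  def recordLookup(probes: Long): Unit = {
    numKeyLookups += 1
    numProbes += probes
  }

  // Mirrors HashedRelation.getAverageProbesPerLookup(): total probes / total lookups.
  def averageProbesPerLookup: Double = numProbes.toDouble / numKeyLookups
}

An average close to 1.0 means lookups usually hit the right slot on the first probe; larger values indicate hash collisions and longer probe chains.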
1 parent 4120a17 commit cd6029d

File tree: 8 files changed, +296 −60 lines changed

sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala

Lines changed: 7 additions & 8 deletions
@@ -60,7 +60,7 @@ case class HashAggregateExec(
     "peakMemory" -> SQLMetrics.createSizeMetric(sparkContext, "peak memory"),
     "spillSize" -> SQLMetrics.createSizeMetric(sparkContext, "spill size"),
     "aggTime" -> SQLMetrics.createTimingMetric(sparkContext, "aggregate time"),
-    "avgHashmapProbe" -> SQLMetrics.createAverageMetric(sparkContext, "avg hashmap probe"))
+    "avgHashProbe" -> SQLMetrics.createAverageMetric(sparkContext, "avg hash probe"))

   override def output: Seq[Attribute] = resultExpressions.map(_.toAttribute)

@@ -94,7 +94,7 @@ case class HashAggregateExec(
     val numOutputRows = longMetric("numOutputRows")
     val peakMemory = longMetric("peakMemory")
     val spillSize = longMetric("spillSize")
-    val avgHashmapProbe = longMetric("avgHashmapProbe")
+    val avgHashProbe = longMetric("avgHashProbe")

     child.execute().mapPartitions { iter =>
@@ -119,7 +119,7 @@ case class HashAggregateExec(
         numOutputRows,
         peakMemory,
         spillSize,
-        avgHashmapProbe)
+        avgHashProbe)
       if (!hasInput && groupingExpressions.isEmpty) {
         numOutputRows += 1
         Iterator.single[UnsafeRow](aggregationIterator.outputForEmptyGroupingKeyWithoutInput())
@@ -344,7 +344,7 @@ case class HashAggregateExec(
       sorter: UnsafeKVExternalSorter,
       peakMemory: SQLMetric,
       spillSize: SQLMetric,
-      avgHashmapProbe: SQLMetric): KVIterator[UnsafeRow, UnsafeRow] = {
+      avgHashProbe: SQLMetric): KVIterator[UnsafeRow, UnsafeRow] = {

     // update peak execution memory
     val mapMemory = hashMap.getPeakMemoryUsedBytes
@@ -355,8 +355,7 @@ case class HashAggregateExec(
     metrics.incPeakExecutionMemory(maxMemory)

     // Update average hashmap probe
-    val avgProbes = hashMap.getAverageProbesPerLookup()
-    avgHashmapProbe.add(avgProbes.ceil.toLong)
+    avgHashProbe.set(hashMap.getAverageProbesPerLookup())

     if (sorter == null) {
       // not spilled
@@ -584,7 +583,7 @@ case class HashAggregateExec(
     val doAgg = ctx.freshName("doAggregateWithKeys")
     val peakMemory = metricTerm(ctx, "peakMemory")
     val spillSize = metricTerm(ctx, "spillSize")
-    val avgHashmapProbe = metricTerm(ctx, "avgHashmapProbe")
+    val avgHashProbe = metricTerm(ctx, "avgHashProbe")

     def generateGenerateCode(): String = {
       if (isFastHashMapEnabled) {
@@ -611,7 +610,7 @@ case class HashAggregateExec(
         s"$iterTermForFastHashMap = $fastHashMapTerm.rowIterator();"} else ""}

        $iterTerm = $thisPlan.finishAggregate($hashMapTerm, $sorterTerm, $peakMemory, $spillSize,
-          $avgHashmapProbe);
+          $avgHashProbe);
      }
    """)
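Note the change from avgHashmapProbe.add(avgProbes.ceil.toLong) to avgHashProbe.set(...): the old code added a rounded-up value to the metric, while the new code overwrites it with the task's final average and keeps the fractional part. A rough sketch of the distinction, using a hypothetical metric class rather than Spark's SQLMetric:

// AvgMetric is hypothetical, only to illustrate add vs. set semantics.
class AvgMetric {
  private var total = 0.0
  def add(v: Long): Unit = total += v   // old path: accumulates ceil'd averages across calls
  def set(v: Double): Unit = total = v  // new path: records the final average once, unrounded
  def current: Double = total
}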

sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIterator.scala

Lines changed: 17 additions & 17 deletions
@@ -89,7 +89,7 @@ class TungstenAggregationIterator(
     numOutputRows: SQLMetric,
     peakMemory: SQLMetric,
     spillSize: SQLMetric,
-    avgHashmapProbe: SQLMetric)
+    avgHashProbe: SQLMetric)
   extends AggregationIterator(
     groupingExpressions,
     originalInputAttributes,
@@ -367,6 +367,22 @@ class TungstenAggregationIterator(
     }
   }

+  TaskContext.get().addTaskCompletionListener(_ => {
+    // At the end of the task, update the task's peak memory usage. Since we destroy
+    // the map to create the sorter, their memory usages should not overlap, so it is safe
+    // to just use the max of the two.
+    val mapMemory = hashMap.getPeakMemoryUsedBytes
+    val sorterMemory = Option(externalSorter).map(_.getPeakMemoryUsedBytes).getOrElse(0L)
+    val maxMemory = Math.max(mapMemory, sorterMemory)
+    val metrics = TaskContext.get().taskMetrics()
+    peakMemory.set(maxMemory)
+    spillSize.set(metrics.memoryBytesSpilled - spillSizeBefore)
+    metrics.incPeakExecutionMemory(maxMemory)
+
+    // Updating average hashmap probe
+    avgHashProbe.set(hashMap.getAverageProbesPerLookup())
+  })
+
   ///////////////////////////////////////////////////////////////////////////
   // Part 7: Iterator's public methods.
   ///////////////////////////////////////////////////////////////////////////
@@ -409,22 +425,6 @@ class TungstenAggregationIterator(
         }
       }

-      // If this is the last record, update the task's peak memory usage. Since we destroy
-      // the map to create the sorter, their memory usages should not overlap, so it is safe
-      // to just use the max of the two.
-      if (!hasNext) {
-        val mapMemory = hashMap.getPeakMemoryUsedBytes
-        val sorterMemory = Option(externalSorter).map(_.getPeakMemoryUsedBytes).getOrElse(0L)
-        val maxMemory = Math.max(mapMemory, sorterMemory)
-        val metrics = TaskContext.get().taskMetrics()
-        peakMemory += maxMemory
-        spillSize += metrics.memoryBytesSpilled - spillSizeBefore
-        metrics.incPeakExecutionMemory(maxMemory)
-
-        // Update average hashmap probe if this is the last record.
-        val averageProbes = hashMap.getAverageProbesPerLookup()
-        avgHashmapProbe.add(averageProbes.ceil.toLong)
-      }
       numOutputRows += 1
       res
     } else {
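The key refactor here: the metric updates move out of the per-record next() path (previously guarded by if (!hasNext)) into a task completion listener registered once up front, so they also run when the iterator is not fully consumed. A self-contained sketch of the pattern, with a hypothetical stand-in for TaskContext:

// DemoTaskContext is a hypothetical stand-in for Spark's TaskContext,
// shown only to illustrate the completion-listener pattern.
class DemoTaskContext {
  private var listeners = List.empty[() => Unit]
  def addTaskCompletionListener(f: () => Unit): Unit = listeners ::= f
  def markTaskCompleted(): Unit = listeners.foreach(_.apply())
}

object ListenerDemo extends App {
  val ctx = new DemoTaskContext
  var lookups = 0L
  var probes = 0L
  // Register once; the per-record path no longer needs a hasNext check.
  ctx.addTaskCompletionListener { () =>
    println(s"avg hash probe: ${probes.toDouble / lookups}")
  }
  lookups += 3; probes += 4
  ctx.markTaskCompleted() // prints: avg hash probe: 1.3333333333333333
}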

sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoinExec.scala

Lines changed: 28 additions & 2 deletions
@@ -28,6 +28,7 @@ import org.apache.spark.sql.catalyst.plans.physical.{BroadcastDistribution, Dist
 import org.apache.spark.sql.execution.{BinaryExecNode, CodegenSupport, SparkPlan}
 import org.apache.spark.sql.execution.metric.SQLMetrics
 import org.apache.spark.sql.types.LongType
+import org.apache.spark.util.TaskCompletionListener

 /**
  * Performs an inner hash join of two child relations. When the output RDD of this operator is
@@ -46,7 +47,8 @@ case class BroadcastHashJoinExec(
   extends BinaryExecNode with HashJoin with CodegenSupport {

   override lazy val metrics = Map(
-    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"))
+    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"),
+    "avgHashProbe" -> SQLMetrics.createAverageMetric(sparkContext, "avg hash probe"))

   override def requiredChildDistribution: Seq[Distribution] = {
     val mode = HashedRelationBroadcastMode(buildKeys)
@@ -60,12 +62,13 @@ case class BroadcastHashJoinExec(

   protected override def doExecute(): RDD[InternalRow] = {
     val numOutputRows = longMetric("numOutputRows")
+    val avgHashProbe = longMetric("avgHashProbe")

     val broadcastRelation = buildPlan.executeBroadcast[HashedRelation]()
     streamedPlan.execute().mapPartitions { streamedIter =>
       val hashed = broadcastRelation.value.asReadOnlyCopy()
       TaskContext.get().taskMetrics().incPeakExecutionMemory(hashed.estimatedSize)
-      join(streamedIter, hashed, numOutputRows)
+      join(streamedIter, hashed, numOutputRows, avgHashProbe)
     }
   }

@@ -90,6 +93,23 @@ case class BroadcastHashJoinExec(
     }
   }

+  /**
+   * Returns the codes used to add a task completion listener to update avg hash probe
+   * at the end of the task.
+   */
+  private def genTaskListener(avgHashProbe: String, relationTerm: String): String = {
+    val listenerClass = classOf[TaskCompletionListener].getName
+    val taskContextClass = classOf[TaskContext].getName
+    s"""
+       | $taskContextClass$$.MODULE$$.get().addTaskCompletionListener(new $listenerClass() {
+       |   @Override
+       |   public void onTaskCompletion($taskContextClass context) {
+       |     $avgHashProbe.set($relationTerm.getAverageProbesPerLookup());
+       |   }
+       | });
+     """.stripMargin
+  }
+
   /**
    * Returns a tuple of Broadcast of HashedRelation and the variable name for it.
    */
@@ -99,10 +119,16 @@ case class BroadcastHashJoinExec(
     val broadcast = ctx.addReferenceObj("broadcast", broadcastRelation)
     val relationTerm = ctx.freshName("relation")
     val clsName = broadcastRelation.value.getClass.getName
+
+    // At the end of the task, we update the avg hash probe.
+    val avgHashProbe = metricTerm(ctx, "avgHashProbe")
+    val addTaskListener = genTaskListener(avgHashProbe, relationTerm)
+
     ctx.addMutableState(clsName, relationTerm,
       s"""
          | $relationTerm = (($clsName) $broadcast.value()).asReadOnlyCopy();
          | incPeakExecutionMemory($relationTerm.estimatedSize());
+         | $addTaskListener
       """.stripMargin)
     (broadcastRelation, relationTerm)
   }
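genTaskListener emits Java source, so it reaches the TaskContext companion object through its MODULE$ field and escapes $ as $$ inside the Scala interpolated string. A reduced sketch of the same templating idea (the parameters stand for names produced by the codegen context, not real Spark API names):

// Reduced sketch of the codegen templating; metricTerm and relationTerm are
// placeholders for identifiers generated elsewhere.
def genListenerCode(metricTerm: String, relationTerm: String): String =
  s"""
     |org.apache.spark.TaskContext$$.MODULE$$.get().addTaskCompletionListener(
     |  new org.apache.spark.util.TaskCompletionListener() {
     |    @Override
     |    public void onTaskCompletion(org.apache.spark.TaskContext context) {
     |      $metricTerm.set($relationTerm.getAverageProbesPerLookup());
     |    }
     |  });
   """.stripMargin

In the interpolated string, each $$ renders as a literal $ in the generated Java.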

sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala

Lines changed: 7 additions & 1 deletion
@@ -17,6 +17,7 @@

 package org.apache.spark.sql.execution.joins

+import org.apache.spark.TaskContext
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans._
@@ -193,7 +194,8 @@ trait HashJoin {
   protected def join(
       streamedIter: Iterator[InternalRow],
       hashed: HashedRelation,
-      numOutputRows: SQLMetric): Iterator[InternalRow] = {
+      numOutputRows: SQLMetric,
+      avgHashProbe: SQLMetric): Iterator[InternalRow] = {

     val joinedIter = joinType match {
       case _: InnerLike =>
@@ -211,6 +213,10 @@ trait HashJoin {
         s"BroadcastHashJoin should not take $x as the JoinType")
     }

+    // At the end of the task, we update the avg hash probe.
+    TaskContext.get().addTaskCompletionListener(_ =>
+      avgHashProbe.set(hashed.getAverageProbesPerLookup()))
+
     val resultProj = createResultProjection
     joinedIter.map { r =>
       numOutputRows += 1

sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala

Lines changed: 40 additions & 3 deletions
@@ -79,6 +79,11 @@ private[execution] sealed trait HashedRelation extends KnownSizeEstimation {
    * Release any used resources.
    */
   def close(): Unit
+
+  /**
+   * Returns the average number of probes per key lookup.
+   */
+  def getAverageProbesPerLookup(): Double
 }

 private[execution] object HashedRelation {
@@ -242,7 +247,8 @@ private[joins] class UnsafeHashedRelation(
     binaryMap = new BytesToBytesMap(
       taskMemoryManager,
       (nKeys * 1.5 + 1).toInt, // reduce hash collision
-      pageSizeBytes)
+      pageSizeBytes,
+      true)

     var i = 0
     var keyBuffer = new Array[Byte](1024)
@@ -273,6 +279,8 @@ private[joins] class UnsafeHashedRelation(
   override def read(kryo: Kryo, in: Input): Unit = Utils.tryOrIOException {
     read(in.readInt, in.readLong, in.readBytes)
   }
+
+  override def getAverageProbesPerLookup(): Double = binaryMap.getAverageProbesPerLookup()
 }

 private[joins] object UnsafeHashedRelation {
@@ -290,7 +298,8 @@ private[joins] object UnsafeHashedRelation {
       taskMemoryManager,
       // Only 70% of the slots can be used before growing, more capacity help to reduce collision
       (sizeEstimate * 1.5 + 1).toInt,
-      pageSizeBytes)
+      pageSizeBytes,
+      true)

     // Create a mapping of buildKeys -> rows
     val keyGenerator = UnsafeProjection.create(key)
@@ -344,7 +353,7 @@ private[joins] object UnsafeHashedRelation {
  * determined by `key1 - minKey`.
  *
  * The map is created as sparse mode, then key-value could be appended into it. Once finish
- * appending, caller could all optimize() to try to turn the map into dense mode, which is faster
+ * appending, caller could call optimize() to try to turn the map into dense mode, which is faster
  * to probe.
  *
  * see http://java-performance.info/implementing-world-fastest-java-int-to-int-hash-map/
@@ -385,6 +394,10 @@ private[execution] final class LongToUnsafeRowMap(val mm: TaskMemoryManager, cap
   // The number of unique keys.
   private var numKeys = 0L

+  // Tracking average number of probes per key lookup.
+  private var numKeyLookups = 0L
+  private var numProbes = 0L
+
   // needed by serializer
   def this() = {
     this(
@@ -469,6 +482,8 @@ private[execution] final class LongToUnsafeRowMap(val mm: TaskMemoryManager, cap
    */
   def getValue(key: Long, resultRow: UnsafeRow): UnsafeRow = {
     if (isDense) {
+      numKeyLookups += 1
+      numProbes += 1
       if (key >= minKey && key <= maxKey) {
         val value = array((key - minKey).toInt)
         if (value > 0) {
@@ -477,11 +492,14 @@ private[execution] final class LongToUnsafeRowMap(val mm: TaskMemoryManager, cap
       }
     } else {
       var pos = firstSlot(key)
+      numKeyLookups += 1
+      numProbes += 1
       while (array(pos + 1) != 0) {
         if (array(pos) == key) {
           return getRow(array(pos + 1), resultRow)
         }
         pos = nextSlot(pos)
+        numProbes += 1
       }
     }
     null
@@ -509,6 +527,8 @@ private[execution] final class LongToUnsafeRowMap(val mm: TaskMemoryManager, cap
    */
   def get(key: Long, resultRow: UnsafeRow): Iterator[UnsafeRow] = {
     if (isDense) {
+      numKeyLookups += 1
+      numProbes += 1
       if (key >= minKey && key <= maxKey) {
         val value = array((key - minKey).toInt)
         if (value > 0) {
@@ -517,11 +537,14 @@ private[execution] final class LongToUnsafeRowMap(val mm: TaskMemoryManager, cap
       }
     } else {
       var pos = firstSlot(key)
+      numKeyLookups += 1
+      numProbes += 1
       while (array(pos + 1) != 0) {
         if (array(pos) == key) {
           return valueIter(array(pos + 1), resultRow)
         }
         pos = nextSlot(pos)
+        numProbes += 1
       }
     }
     null
@@ -573,8 +596,11 @@ private[execution] final class LongToUnsafeRowMap(val mm: TaskMemoryManager, cap
   private def updateIndex(key: Long, address: Long): Unit = {
     var pos = firstSlot(key)
     assert(numKeys < array.length / 2)
+    numKeyLookups += 1
+    numProbes += 1
     while (array(pos) != key && array(pos + 1) != 0) {
       pos = nextSlot(pos)
+      numProbes += 1
     }
     if (array(pos + 1) == 0) {
       // this is the first value for this key, put the address in array.
@@ -686,6 +712,8 @@ private[execution] final class LongToUnsafeRowMap(val mm: TaskMemoryManager, cap
     writeLong(maxKey)
     writeLong(numKeys)
     writeLong(numValues)
+    writeLong(numKeyLookups)
+    writeLong(numProbes)

     writeLong(array.length)
     writeLongArray(writeBuffer, array, array.length)
@@ -727,6 +755,8 @@ private[execution] final class LongToUnsafeRowMap(val mm: TaskMemoryManager, cap
     maxKey = readLong()
     numKeys = readLong()
     numValues = readLong()
+    numKeyLookups = readLong()
+    numProbes = readLong()

     val length = readLong().toInt
     mask = length - 2
@@ -742,6 +772,11 @@ private[execution] final class LongToUnsafeRowMap(val mm: TaskMemoryManager, cap
   override def read(kryo: Kryo, in: Input): Unit = {
     read(in.readBoolean, in.readLong, in.readBytes)
   }
+
+  /**
+   * Returns the average number of probes per key lookup.
+   */
+  def getAverageProbesPerLookup(): Double = numProbes.toDouble / numKeyLookups
 }

 private[joins] class LongHashedRelation(
@@ -793,6 +828,8 @@ private[joins] class LongHashedRelation(
     resultRow = new UnsafeRow(nFields)
     map = in.readObject().asInstanceOf[LongToUnsafeRowMap]
   }
+
+  override def getAverageProbesPerLookup(): Double = map.getAverageProbesPerLookup()
 }

 /**
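The counting convention in LongToUnsafeRowMap: every lookup costs at least one probe (dense mode is always exactly one), and each extra slot visited while chaining in sparse mode adds one more. A self-contained sketch of the same convention on a simplified open-addressing map (a stand-in, not LongToUnsafeRowMap itself):

// Simplified open-addressing map that counts probes the way this patch does.
// Hypothetical stand-in; assumes a power-of-two capacity that is never filled.
final class ProbeCountingMap(capacity: Int) {
  require((capacity & (capacity - 1)) == 0, "capacity must be a power of two")
  private val keys = new Array[Long](capacity)
  private val occupied = new Array[Boolean](capacity)
  private val values = new Array[Long](capacity)
  private var numKeyLookups = 0L
  private var numProbes = 0L

  private def firstSlot(key: Long): Int = key.## & (capacity - 1)
  private def nextSlot(pos: Int): Int = (pos + 1) & (capacity - 1)

  def put(key: Long, value: Long): Unit = {
    var pos = firstSlot(key)
    while (occupied(pos) && keys(pos) != key) pos = nextSlot(pos)
    keys(pos) = key; values(pos) = value; occupied(pos) = true
  }

  def get(key: Long): Option[Long] = {
    numKeyLookups += 1
    numProbes += 1                  // the first slot inspected counts as one probe
    var pos = firstSlot(key)
    while (occupied(pos)) {
      if (keys(pos) == key) return Some(values(pos))
      pos = nextSlot(pos)
      numProbes += 1                // each extra slot visited is another probe
    }
    None                            // miss: the empty slot ends the chain
  }

  // NaN before the first lookup, matching a straight probes / lookups division.
  def averageProbesPerLookup: Double = numProbes.toDouble / numKeyLookups
}

With a low load factor the average stays near 1.0; as the table fills, probe chains lengthen and the average climbs, which is exactly the signal the new SQL metric surfaces.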
