
Commit a11db94

ConeyLiu authored and cloud-fan committed
[SPARK-21923][CORE] Avoid calling reserveUnrollMemoryForThisTask for every record
## What changes were proposed in this pull request?

When Spark persists data to unsafe (off-heap) memory, we call `MemoryStore.putIteratorAsBytes`, which needs to synchronize on the `memoryManager` for every record written. This per-record synchronization is unnecessary: we can request a larger amount of memory at a time and check only periodically, reducing contention.

## How was this patch tested?

Test case (1 executor, 20 cores):

```scala
val start = System.currentTimeMillis()
val data = sc.parallelize(0 until Integer.MAX_VALUE, 100)
  .persist(StorageLevel.OFF_HEAP)
  .count()
println(System.currentTimeMillis() - start)
```

Test result (wall-clock time in ms, five runs):

| | Run 1 | Run 2 | Run 3 | Run 4 | Run 5 |
|---|---|---|---|---|---|
| before | 27647 | 29108 | 28591 | 28264 | 27232 |
| after | 26868 | 26358 | 27767 | 26653 | 26693 |

Author: Xianyang Liu <[email protected]>

Closes apache#19135 from ConeyLiu/memorystore.
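The core idea is easiest to see in isolation. Below is a minimal, hypothetical Scala sketch of the pattern this patch applies, not Spark's actual `MemoryStore` code: the names `reserve`, `checkPeriod`, and `growthFactor` are illustrative stand-ins. The expensive synchronized reservation runs at most once every `checkPeriod` records, and each reservation over-requests by `growthFactor` so most checks pass without touching the lock at all.

```scala
// A minimal sketch of the batching idea behind this patch (not Spark's actual
// MemoryStore): instead of taking the memory-manager lock for every record,
// reserve memory only every `checkPeriod` records, and over-request by
// `growthFactor` so subsequent checks usually succeed lock-free.
object BatchedReservationSketch {
  private val lock = new Object
  private var granted = 0L // total bytes the "memory manager" has granted

  // Stand-in for reserveUnrollMemoryForThisTask: the expensive synchronized
  // call whose frequency the patch reduces.
  private def reserve(bytes: Long): Boolean = lock.synchronized {
    granted += bytes
    true // in this sketch the grant always succeeds
  }

  def unroll(records: Iterator[Array[Byte]],
             checkPeriod: Long = 16,     // cf. spark.storage.unrollMemoryCheckPeriod
             growthFactor: Double = 1.5  // cf. spark.storage.unrollMemoryGrowthFactor
            ): Boolean = {
    var used = 0L      // bytes written so far
    var count = 0L     // records unrolled so far
    var keepGoing = true
    while (records.hasNext && keepGoing) {
      used += records.next().length
      count += 1
      // The lock is touched at most once per `checkPeriod` records.
      if (count % checkPeriod == 0 && used > granted) {
        keepGoing = reserve((used * growthFactor - granted).toLong)
      }
    }
    keepGoing
  }
}
```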
1 parent 10f45b3 · commit a11db94

File tree

2 files changed: +29, -4 lines


core/src/main/scala/org/apache/spark/internal/config/package.scala

Lines changed: 15 additions & 0 deletions
```diff
@@ -385,4 +385,19 @@ package object config {
       .checkValue(v => v > 0 && v <= Int.MaxValue,
         s"The buffer size must be greater than 0 and less than ${Int.MaxValue}.")
       .createWithDefault(1024 * 1024)
+
+  private[spark] val UNROLL_MEMORY_CHECK_PERIOD =
+    ConfigBuilder("spark.storage.unrollMemoryCheckPeriod")
+      .internal()
+      .doc("The memory check period is used to determine how often we should check whether "
+        + "there is a need to request more memory when we try to unroll the given block in memory.")
+      .longConf
+      .createWithDefault(16)
+
+  private[spark] val UNROLL_MEMORY_GROWTH_FACTOR =
+    ConfigBuilder("spark.storage.unrollMemoryGrowthFactor")
+      .internal()
+      .doc("Memory to request as a multiple of the size that used to unroll the block.")
+      .doubleConf
+      .createWithDefault(1.5)
 }
```
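Both properties are marked `.internal()` in the diff above, so they are not part of Spark's public configuration surface; still, a hypothetical tuning example may help show how they would be set for experimentation:

```scala
import org.apache.spark.SparkConf

// Illustrative values only: raise the check period to halve lock traffic,
// and grow reservations more aggressively per check.
val conf = new SparkConf()
  .set("spark.storage.unrollMemoryCheckPeriod", "32")   // check every 32 records
  .set("spark.storage.unrollMemoryGrowthFactor", "2.0") // request 2x current size
```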

core/src/main/scala/org/apache/spark/storage/memory/MemoryStore.scala

Lines changed: 14 additions & 4 deletions
```diff
@@ -29,6 +29,7 @@ import com.google.common.io.ByteStreams
 
 import org.apache.spark.{SparkConf, TaskContext}
 import org.apache.spark.internal.Logging
+import org.apache.spark.internal.config.{UNROLL_MEMORY_CHECK_PERIOD, UNROLL_MEMORY_GROWTH_FACTOR}
 import org.apache.spark.memory.{MemoryManager, MemoryMode}
 import org.apache.spark.serializer.{SerializationStream, SerializerManager}
 import org.apache.spark.storage.{BlockId, BlockInfoManager, StorageLevel, StreamBlockId}
@@ -190,11 +191,11 @@ private[spark] class MemoryStore(
     // Initial per-task memory to request for unrolling blocks (bytes).
     val initialMemoryThreshold = unrollMemoryThreshold
     // How often to check whether we need to request more memory
-    val memoryCheckPeriod = 16
+    val memoryCheckPeriod = conf.get(UNROLL_MEMORY_CHECK_PERIOD)
     // Memory currently reserved by this task for this particular unrolling operation
     var memoryThreshold = initialMemoryThreshold
     // Memory to request as a multiple of current vector size
-    val memoryGrowthFactor = 1.5
+    val memoryGrowthFactor = conf.get(UNROLL_MEMORY_GROWTH_FACTOR)
     // Keep track of unroll memory used by this particular block / putIterator() operation
     var unrollMemoryUsedByThisBlock = 0L
     // Underlying vector for unrolling the block
@@ -325,6 +326,12 @@ private[spark] class MemoryStore(
 
     // Whether there is still enough memory for us to continue unrolling this block
     var keepUnrolling = true
+    // Number of elements unrolled so far
+    var elementsUnrolled = 0L
+    // How often to check whether we need to request more memory
+    val memoryCheckPeriod = conf.get(UNROLL_MEMORY_CHECK_PERIOD)
+    // Memory to request as a multiple of current bbos size
+    val memoryGrowthFactor = conf.get(UNROLL_MEMORY_GROWTH_FACTOR)
     // Initial per-task memory to request for unrolling blocks (bytes).
     val initialMemoryThreshold = unrollMemoryThreshold
     // Keep track of unroll memory used by this particular block / putIterator() operation
@@ -359,7 +366,7 @@ private[spark] class MemoryStore(
 
     def reserveAdditionalMemoryIfNecessary(): Unit = {
       if (bbos.size > unrollMemoryUsedByThisBlock) {
-        val amountToRequest = bbos.size - unrollMemoryUsedByThisBlock
+        val amountToRequest = (bbos.size * memoryGrowthFactor - unrollMemoryUsedByThisBlock).toLong
        keepUnrolling = reserveUnrollMemoryForThisTask(blockId, amountToRequest, memoryMode)
        if (keepUnrolling) {
          unrollMemoryUsedByThisBlock += amountToRequest
@@ -370,7 +377,10 @@ private[spark] class MemoryStore(
     // Unroll this block safely, checking whether we have exceeded our threshold
     while (values.hasNext && keepUnrolling) {
       serializationStream.writeObject(values.next())(classTag)
-      reserveAdditionalMemoryIfNecessary()
+      elementsUnrolled += 1
+      if (elementsUnrolled % memoryCheckPeriod == 0) {
+        reserveAdditionalMemoryIfNecessary()
+      }
     }
 
     // Make sure that we have enough memory to store the block. By this point, it is possible that
```
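To see what the changed reservation line buys, here is the arithmetic with made-up numbers (the values are illustrative, not from the patch):

```scala
// Suppose the serialized stream (bbos) has grown to 1000 bytes and this
// block has reserved 800 bytes so far.
val bbosSize = 1000L
val memoryGrowthFactor = 1.5 // default of UNROLL_MEMORY_GROWTH_FACTOR
val unrollMemoryUsedByThisBlock = 800L

// Old behavior: request only the shortfall (200 bytes), so the very next
// growth of the stream forces another synchronized reservation.
val oldRequest = bbosSize - unrollMemoryUsedByThisBlock // 200

// New behavior: request up to 1.5x the current size (700 bytes), so the
// stream can grow to 1500 bytes before the lock is taken again.
val newRequest =
  (bbosSize * memoryGrowthFactor - unrollMemoryUsedByThisBlock).toLong // 700
```

Combined with checking only every `memoryCheckPeriod` records, this bounds synchronized calls to a small fraction of the records written.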
