Skip to content

Commit dbad14c

Browse files
committed
Add a plug-in for synchronized creating kernels
This plug-in is a workaround for https://software.intel.com/en-us/forums/opencl/topic/760981
1 parent 6dabd91 commit dbad14c

File tree

3 files changed

+63
-32
lines changed

3 files changed

+63
-32
lines changed

OpenCL/src/main/scala/com/thoughtworks/compute/OpenCL.scala

Lines changed: 56 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -784,20 +784,14 @@ object OpenCL {
784784
extends AnyVal
785785
with MonadicCloseable[UnitContinuation] {
786786

787-
private def numberOfKernels: Int = {
788-
val result = Array.ofDim[Int](1)
789-
checkErrorCode(clCreateKernelsInProgram(handle, null, result))
790-
result(0)
791-
}
792-
793787
def deviceIds: Seq[DeviceId[Owner]] = {
794788
val stack = stackPush()
795789
try {
796790
val sizeBuffer = stack.mallocPointer(1)
797791
checkErrorCode(clGetProgramInfo(this.handle, CL_PROGRAM_DEVICES, null: PointerBuffer, sizeBuffer))
798792
val numberOfDeviceIds = sizeBuffer.get(0).toInt / POINTER_SIZE
799793
val programDevicesBuffer = stack.mallocPointer(numberOfDeviceIds)
800-
checkErrorCode(clGetProgramInfo(this.handle, CL_PROGRAM_DEVICES, programDevicesBuffer, sizeBuffer))
794+
checkErrorCode(clGetProgramInfo(this.handle, CL_PROGRAM_DEVICES, programDevicesBuffer, null: PointerBuffer))
801795
(0 until numberOfDeviceIds).map { i =>
802796
DeviceId[Owner](programDevicesBuffer.get(i))
803797
}
@@ -806,27 +800,16 @@ object OpenCL {
806800
}
807801
}
808802

809-
def createKernels(): Seq[Kernel[Owner]] = {
810-
(0 until createKernelBuffer().capacity).map { i =>
811-
Kernel[Owner](createKernelBuffer().get(i))
812-
}
813-
}
814-
815-
private def createKernelBuffer(): PointerBuffer = {
816-
val kernelBuffer = BufferUtils.createPointerBuffer(numberOfKernels)
817-
checkErrorCode(clCreateKernelsInProgram(handle, kernelBuffer, null: IntBuffer))
818-
kernelBuffer
803+
def createKernels()(implicit witness: Witness.Aux[Owner]): Seq[Kernel[Owner]] = {
804+
witness.value.createKernels(this.asInstanceOf[witness.value.Program]).asInstanceOf[Seq[Kernel[Owner]]]
819805
}
820806

821-
def createFirstKernel(): Kernel[Owner] = {
822-
val stack = stackPush()
823-
try {
824-
val kernelBuffer = stack.mallocPointer(1)
825-
checkErrorCode(clCreateKernelsInProgram(handle, kernelBuffer, null: IntBuffer))
826-
Kernel(kernelBuffer.get(0))
827-
} finally {
828-
stack.close()
829-
}
807+
/** Creates single kernel from this [[Program]].
808+
*
809+
* @throws InvalidValue if the this [[Program]] has more than one kernel.
810+
*/
811+
def createKernel()(implicit witness: Witness.Aux[Owner]): Kernel[Owner] = {
812+
witness.value.createKernel(this.asInstanceOf[witness.value.Program]).asInstanceOf[Kernel[Owner]]
830813
}
831814

832815
private def buildLogs(deviceIds: Seq[DeviceId[Owner]]): Map[DeviceId[Owner], String] = {
@@ -1044,11 +1027,58 @@ object OpenCL {
10441027
}
10451028
}
10461029

1030+
/** Make the calls to [[createKernels]] and [[createKernel]] synchronized.
1031+
*
1032+
* @note If you are using Intel OpenCL SDK, you will need this plug-in as a workaround
1033+
* @see [[https://software.intel.com/en-us/forums/opencl/topic/760981
1034+
* Bug report: clCreateKernelsInProgram is not thread-safe]]
1035+
*/
1036+
trait SynchronizedCreatingKernel extends OpenCL {
1037+
override protected def createKernels(program: Program): Seq[Kernel] = synchronized {
1038+
super.createKernels(program)
1039+
}
1040+
1041+
override protected def createKernel(program: Program): Kernel = synchronized {
1042+
super.createKernel(program)
1043+
}
1044+
}
1045+
10471046
}
10481047

10491048
trait OpenCL extends MonadicCloseable[UnitContinuation] with ImplicitsSingleton with DefaultCloseable {
10501049
import OpenCL._
10511050

1051+
protected def createKernels(program: Program): Seq[Kernel] = {
1052+
val stack = stackPush()
1053+
try {
1054+
val numberOfKernelsBuffer = stack.mallocInt(1)
1055+
checkErrorCode(clCreateKernelsInProgram(program.handle, null, numberOfKernelsBuffer))
1056+
val numberOfKernels = numberOfKernelsBuffer.get(0)
1057+
val kernelBuffer = stack.mallocPointer(numberOfKernels)
1058+
checkErrorCode(clCreateKernelsInProgram(program.handle, kernelBuffer, null: IntBuffer))
1059+
(0 until kernelBuffer.capacity).map { i =>
1060+
new Kernel(kernelBuffer.get(i))
1061+
}
1062+
} finally {
1063+
stack.close()
1064+
}
1065+
}
1066+
1067+
/** Creates single kernel from this [[Program]].
1068+
*
1069+
* @throws InvalidValue if the this [[Program]] has more than one kernel.
1070+
*/
1071+
protected def createKernel(program: Program): Kernel = {
1072+
val stack = stackPush()
1073+
try {
1074+
val kernelBuffer = stack.mallocPointer(1)
1075+
checkErrorCode(clCreateKernelsInProgram(program.handle, kernelBuffer, null: IntBuffer))
1076+
new Kernel(kernelBuffer.get(0))
1077+
} finally {
1078+
stack.close()
1079+
}
1080+
}
1081+
10521082
protected val logger: Logger
10531083

10541084
type Program = OpenCL.Program[this.type]

Tensors/src/main/scala/com/thoughtworks/compute/Tensors.scala

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -461,7 +461,7 @@ trait Tensors extends OpenCL {
461461
val doBuffer: Do[PendingBuffer[Float]] = {
462462
val size = shape.product
463463
allocateBuffer[Float](size).flatMap { buffer =>
464-
Do.monadicCloseable(randomProgram.createFirstKernel()).intransitiveFlatMap { kernel =>
464+
Do.monadicCloseable(randomProgram.createKernel()).intransitiveFlatMap { kernel =>
465465
kernel(0) = buffer
466466
kernel(1) = seed
467467
dispatch(kernel.enqueue(_, globalWorkSize = Array(size.toLong))).map(EventBuffer[Float](buffer, _))
@@ -487,7 +487,7 @@ trait Tensors extends OpenCL {
487487
size
488488
}
489489
allocateBuffer[Float](paddingSize).flatMap { buffer =>
490-
Do.monadicCloseable(randomNormalProgram.createFirstKernel()).intransitiveFlatMap { kernel =>
490+
Do.monadicCloseable(randomNormalProgram.createKernel()).intransitiveFlatMap { kernel =>
491491
kernel(0) = buffer
492492
kernel(1) = seed
493493
val globalWorkSize = Array((paddingSize / 2).toLong)
@@ -578,7 +578,7 @@ trait Tensors extends OpenCL {
578578
dispatch { commandQueue =>
579579
commandQueue.deviceId.deviceType match {
580580
case CL_DEVICE_TYPE_CPU =>
581-
Do.monadicCloseable(programs.sequentialReductionProgram.createFirstKernel()).intransitiveFlatMap {
581+
Do.monadicCloseable(programs.sequentialReductionProgram.createKernel()).intransitiveFlatMap {
582582
kernel1: Kernel =>
583583
kernel1(0) = inputPendingBuffer.buffer
584584
kernel1(1) = length
@@ -592,7 +592,7 @@ trait Tensors extends OpenCL {
592592
)
593593
}
594594
case _ =>
595-
Do.monadicCloseable(programs.parallelReductionProgram.createFirstKernel()).intransitiveFlatMap {
595+
Do.monadicCloseable(programs.parallelReductionProgram.createKernel()).intransitiveFlatMap {
596596
kernel1: Kernel =>
597597
val stage1LocalWorkSize: Long = math.min(length, kernel1.workGroupSize(commandQueue.deviceId))
598598
val maxNumberOfReductionGroups = commandQueue.deviceId.maxComputeUnits
@@ -627,7 +627,7 @@ trait Tensors extends OpenCL {
627627
waitingEvents = inputPendingBuffer.eventOption.map(_.handle).toSeq
628628
)
629629
.intransitiveFlatMap { scratchEvent: Event =>
630-
Do.monadicCloseable(programs.parallelReductionProgram.createFirstKernel())
630+
Do.monadicCloseable(programs.parallelReductionProgram.createKernel())
631631
.intransitiveFlatMap { kernel2: Kernel =>
632632
// FIXME: An exception thrown here will not be handled. Need further investigation.
633633

@@ -1005,7 +1005,7 @@ trait Tensors extends OpenCL {
10051005
}
10061006
.unwrap
10071007
.intransitiveFlatMap { arguments: List[PendingBuffer[_]] =>
1008-
Do.monadicCloseable(program.createFirstKernel()).intransitiveFlatMap { kernel: Kernel =>
1008+
Do.monadicCloseable(program.createKernel()).intransitiveFlatMap { kernel: Kernel =>
10091009
val valueType = convertedTerm.valueType.asInstanceOf[ValueType]
10101010
val memory = valueType.memory.asInstanceOf[Memory[convertedTerm.JvmValue]]
10111011
allocateBuffer[convertedTerm.JvmValue](shape.product)(memory).flatMap { outputBuffer =>

benchmarks/src/jmh/scala/com/thoughtworks/compute/benchmarks.scala

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ object benchmarks {
3636
with OpenCL.GlobalExecutionContext
3737
with OpenCL.CommandQueuePool
3838
with OpenCL.DontReleaseEventTooEarly
39+
with OpenCL.SynchronizedCreatingKernel
3940
with Tensors.WangHashingRandomNumberGenerator {
4041
@transient
4142
protected lazy val (platformId: PlatformId, deviceIds: Seq[DeviceId]) = {

0 commit comments

Comments
 (0)