99
1010use std:: sync:: Arc ;
1111
12- use futures:: executor:: block_on;
1312use vortex:: array:: ArrayRef ;
1413use vortex:: array:: DynArray ;
1514use vortex:: array:: ExecutionCtx ;
@@ -324,13 +323,24 @@ impl PlanBuilderState<'_> {
324323 fn walk_primitive ( & mut self , array : ArrayRef ) -> VortexResult < Pipeline > {
325324 let prim = array. to_canonical ( ) ?. into_primitive ( ) ;
326325 let PrimitiveArrayParts { buffer, .. } = prim. into_parts ( ) ;
327- let device_buf = block_on ( self . ctx . ensure_on_device ( buffer) ) ?;
326+
327+ // TODO(0ax1): Optimize device buffer allocation and copying.
328+ //
329+ // Ideally, there would be a buffer pool of preallocated device memory
330+ // such that retrieving a device pointer is O(1) when building the
331+ // dynamic dispatch plan. In the current setup, we need to allocate the
332+ // device buffer before we can get the device pointer. Because the host
333+ // memory is allocated via the global allocator, which, unlike
334+ // `cudaHostAlloc`, does not pin it to physical addresses, the subsequent
335+ // host-to-device copy is synchronous and cannot be pushed to the CUDA
336+ // stream as an async operation.
337+ let device_buf = self . ctx . ensure_on_device_sync ( buffer) ?;
328338 let ptr = device_buf. cuda_device_ptr ( ) ?;
329339 self . device_buffers . push ( device_buf) ;
330340 Ok ( Pipeline {
331341 source : SourceOp :: load ( ) ,
332342 scalar_ops : vec ! [ ] ,
333- input_ptr : ptr as u64 ,
343+ input_ptr : ptr,
334344 } )
335345 }
336346
@@ -354,13 +364,13 @@ impl PlanBuilderState<'_> {
354364 vortex_bail ! ( "Dynamic dispatch does not support BitPackedArray with patches" ) ;
355365 }
356366
357- let device_buf = block_on ( self . ctx . ensure_on_device ( packed) ) ?;
367+ let device_buf = self . ctx . ensure_on_device_sync ( packed) ?;
358368 let ptr = device_buf. cuda_device_ptr ( ) ?;
359369 self . device_buffers . push ( device_buf) ;
360370 Ok ( Pipeline {
361371 source : SourceOp :: bitunpack ( bit_width, offset) ,
362372 scalar_ops : vec ! [ ] ,
363- input_ptr : ptr as u64 ,
373+ input_ptr : ptr,
364374 } )
365375 }
366376
0 commit comments