@@ -86,9 +86,7 @@ async def wait_for(coros, name=""):
     # wrap the coro in a task to work with python 3.10 and 3.11+ where asyncio.wait semantics
     # changed to not accept any awaitable
     start = time.time()
-    done, _ = await asyncio.wait(
-        [asyncio.create_task(_ensure_coro(c)) for c in coros]
-    )
+    done, _ = await asyncio.wait([asyncio.create_task(_ensure_coro(c)) for c in coros])
     end = time.time()
     log.info(f"waiting for {name} took {end - start}s")
     for d in done:
@@ -166,9 +164,7 @@ async def acquire(self, need=1):
         need_to_make = need - have
 
         if need_to_make > can_make:
-            raise Exception(
-                f"Cannot allocate workers above {self.max_workers}"
-            )
+            raise Exception(f"Cannot allocate workers above {self.max_workers}")
 
         if need_to_make > 0:
             log.debug(f"creating {need_to_make} additional processors")
@@ -197,9 +193,9 @@ def _new_processor(self):
         self.processors_ready.clear()
         processor_key = new_friendly_name()
         log.debug(f"starting processor: {processor_key}")
-        processor = DFRayProcessor.options(
-            name=f"Processor : {processor_key}"
-        ).remote(processor_key)
+        processor = DFRayProcessor.options(name=f"Processor : {processor_key}").remote(
+            processor_key
+        )
         self.pool[processor_key] = processor
         self.processors_started.add(processor.start_up.remote())
         self.available.add(processor_key)
@@ -248,9 +244,7 @@ async def _wait_for_serve(self):
 
     async def all_done(self):
         log.info("calling processor all done")
-        refs = [
-            processor.all_done.remote() for processor in self.pool.values()
-        ]
+        refs = [processor.all_done.remote() for processor in self.pool.values()]
         await wait_for(refs, "processors to be all done")
         log.info("all processors shutdown")
 
@@ -293,9 +287,7 @@ async def update_plan(
         )
 
     async def serve(self):
-        log.info(
-            f"[{self.processor_key}] serving on {self.processor_service.addr()}"
-        )
+        log.info(f"[{self.processor_key}] serving on {self.processor_service.addr()}")
         await self.processor_service.serve()
         log.info(f"[{self.processor_key}] done serving")
 
@@ -332,9 +324,7 @@ def __init__(
         worker_pool_min: int,
         worker_pool_max: int,
     ) -> None:
-        log.info(
-            f"Creating DFRayContextSupervisor worker_pool_min: {worker_pool_min}"
-        )
+        log.info(f"Creating DFRayContextSupervisor worker_pool_min: {worker_pool_min}")
         self.pool = DFRayProcessorPool(worker_pool_min, worker_pool_max)
         self.stages: dict[str, InternalStageData] = {}
         log.info("Created DFRayContextSupervisor")
@@ -347,9 +337,7 @@ async def wait_for_ready(self):
 
     async def get_stage_addrs(self, stage_id: int):
         addrs = [
-            sd.remote_addr
-            for sd in self.stages.values()
-            if sd.stage_id == stage_id
+            sd.remote_addr for sd in self.stages.values() if sd.stage_id == stage_id
         ]
         return addrs
 
@@ -399,10 +387,7 @@ async def new_query(
             refs.append(
                 isd.remote_processor.update_plan.remote(
                     isd.stage_id,
-                    {
-                        stage_id: val["child_addrs"]
-                        for (stage_id, val) in kid.items()
-                    },
+                    {stage_id: val["child_addrs"] for (stage_id, val) in kid.items()},
                     isd.partition_group,
                     isd.plan_bytes,
                 )
@@ -434,9 +419,7 @@ async def sort_out_addresses(self):
             ]
 
             # sanity check
-            assert all(
-                [op == output_partitions[0] for op in output_partitions]
-            )
+            assert all([op == output_partitions[0] for op in output_partitions])
             output_partitions = output_partitions[0]
 
             for child_stage_isd in child_stage_datas:
@@ -520,9 +503,7 @@ def collect(self) -> list[pa.RecordBatch]:
             )
             log.debug(f"last stage addrs {last_stage_addrs}")
 
-            reader = self.df.read_final_stage(
-                last_stage_id, last_stage_addrs[0]
-            )
+            reader = self.df.read_final_stage(last_stage_id, last_stage_addrs[0])
             log.debug("got reader")
             self._batches = list(reader)
             return self._batches
@@ -589,11 +570,55 @@ def __init__(
         )
 
     def register_parquet(self, name: str, path: str):
573+ """
574+ Register a Parquet file with the given name and path.
575+ The path can be a local filesystem path, absolute filesystem path, or a url.
576+
577+ If the path is a object store url, the appropriate object store will be registered.
578+ Configuration of the object store will be gathered from the environment.
579+
580+ For example for s3:// urls, credentials will be looked for by the AWS SDK,
581+ which will check environment variables, credential files, etc
582+
583+ Parameters:
584+ path (str): The file path to the Parquet file.
585+ name (str): The name to register the Parquet file under.
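+
+        Example (a minimal usage sketch; the table name and bucket are hypothetical):
+            ctx.register_parquet("trips", "s3://my-bucket/trips.parquet")
+            df = ctx.sql("SELECT count(*) FROM trips")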
586+ """
         self.ctx.register_parquet(name, path)
 
-    def register_listing_table(
-        self, name: str, path: str, file_extention="parquet"
-    ):
+    def register_csv(self, name: str, path: str):
+        """
+        Register a CSV file with the given name and path.
+        The path can be a local filesystem path, an absolute filesystem path, or a URL.
+
+        If the path is an object store URL, the appropriate object store will be registered.
+        Configuration of the object store will be gathered from the environment.
+
+        For example, for s3:// URLs, credentials will be looked up by the AWS SDK,
+        which will check environment variables, credential files, etc.
+
+        Parameters:
+            name (str): The name to register the CSV file under.
+            path (str): The file path to the CSV file.
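+
+        Example (a minimal usage sketch; the table name and path are hypothetical):
+            ctx.register_csv("users", "data/users.csv")
+            df = ctx.sql("SELECT * FROM users LIMIT 10")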
603+ """
604+ self .ctx .register_csv (name , path )
605+
606+ def register_listing_table (self , name : str , path : str , file_extention = "parquet" ):
607+ """
608+ Register a directory of parquet files with the given name.
609+ The path can be a local filesystem path, absolute filesystem path, or a url.
610+
611+ If the path is a object store url, the appropriate object store will be registered.
612+ Configuration of the object store will be gathered from the environment.
613+
614+ For example for s3:// urls, credentials will be looked for by the AWS SDK,
615+ which will check environment variables, credential files, etc
616+
617+ Parameters:
618+ path (str): The file path to the Parquet file directory
619+ name (str): The name to register the Parquet file under.
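+
+        Example (a minimal usage sketch; the table name and directory are hypothetical):
+            ctx.register_listing_table("events", "s3://my-bucket/events/")
+            df = ctx.sql("SELECT count(*) FROM events")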
620+ """
621+
597622 self .ctx .register_listing_table (name , path , file_extention )
598623
599624 def sql (self , query : str ) -> DFRayDataFrame :