@@ -329,7 +329,7 @@ def read_sql_query(  # pylint: disable=too-many-branches,too-many-locals
     database: str,
     ctas_approach: bool = True,
     categories: List[str] = None,
-    chunksize: Optional[int] = None,
+    chunksize: Optional[Union[int, bool]] = None,
     s3_output: Optional[str] = None,
     workgroup: Optional[str] = None,
     encryption: Optional[str] = None,
@@ -353,10 +353,6 @@ def read_sql_query(  # pylint: disable=too-many-branches,too-many-locals
     CONS: Slower (but still faster than other libraries that use the regular Athena API)
     and does not handle nested types at all.

-    Note
-    ----
-    If `chunksize` is passed, then a Generator of DataFrames is returned.
-
     Note
     ----
     If `ctas_approach` is True, `chunksize` will return non-deterministic chunk sizes,
@@ -367,6 +363,21 @@ def read_sql_query(  # pylint: disable=too-many-branches,too-many-locals
     Create the default Athena bucket if it doesn't exist and s3_output is None.
     (E.g. s3://aws-athena-query-results-ACCOUNT-REGION/)

+    Note
+    ----
+    ``Batching`` (`chunksize` argument) (Memory Friendly):
+
+    Enables the function to return an Iterable of DataFrames instead of a regular DataFrame.
+
+    There are two batching strategies in Wrangler:
+
+    - If **chunksize=True**, a new DataFrame will be returned for each file in the query result.
+
+    - If **chunksize=INTEGER**, Wrangler will iterate over the data in chunks with the number of rows equal to the received INTEGER.
+
+    `P.S.` `chunksize=True` is faster and uses less memory, while `chunksize=INTEGER` is more precise
+    in the number of rows for each DataFrame.
+
     Note
     ----
     In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().
@@ -383,8 +394,10 @@ def read_sql_query(  # pylint: disable=too-many-branches,too-many-locals
     categories: List[str], optional
         List of column names that should be returned as pandas.Categorical.
         Recommended for memory-restricted environments.
-    chunksize: int, optional
-        If specified, return an generator where chunksize is the number of rows to include in each chunk.
+    chunksize : Union[int, bool], optional
+        If passed, the data will be split into an Iterable of DataFrames (memory friendly).
+        If `True`, Wrangler will iterate over the data by files in the most efficient way, without guaranteeing the chunk size.
+        If an `INTEGER` is passed, Wrangler will iterate over the data in chunks with the number of rows equal to the received INTEGER.
     s3_output : str, optional
         AWS S3 path.
     workgroup : str, optional
@@ -454,7 +467,7 @@ def read_sql_query(  # pylint: disable=too-many-branches,too-many-locals
         catalog.delete_table_if_exists(database=database, table=name, boto3_session=session)
         manifest_path: str = f"{_s3_output}/tables/{query_id}-manifest.csv"
         paths: List[str] = _extract_ctas_manifest_paths(path=manifest_path, boto3_session=session)
-        chunked: bool = chunksize is not None
+        chunked: Union[bool, int] = False if chunksize is None else chunksize
         _logger.debug(f"chunked: {chunked}")
         if not paths:
             if chunked is False:
@@ -473,6 +486,8 @@ def read_sql_query(  # pylint: disable=too-many-branches,too-many-locals
         path = f"{_s3_output}/{query_id}.csv"
         s3.wait_objects_exist(paths=[path], use_threads=False, boto3_session=session)
         _logger.debug(f"Start CSV reading from {path}")
+        _chunksize: Optional[int] = chunksize if isinstance(chunksize, int) and not isinstance(chunksize, bool) else None
+        _logger.debug(f"_chunksize: {_chunksize}")
         ret = s3.read_csv(
             path=[path],
             dtype=dtype,
@@ -481,7 +496,7 @@ def read_sql_query(  # pylint: disable=too-many-branches,too-many-locals
             quoting=csv.QUOTE_ALL,
             keep_default_na=False,
             na_values=[""],
-            chunksize=chunksize,
+            chunksize=_chunksize,
             skip_blank_lines=False,
             use_threads=False,
             boto3_session=session,
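The two `chunksize` strategies documented above can be exercised as follows. A minimal usage sketch, assuming a hypothetical table `my_table` in a hypothetical database `my_db`:

```python
import awswrangler as wr

# chunksize=True: one DataFrame per file in the query result.
# Fastest and most memory friendly, but chunk sizes are not guaranteed.
for df in wr.athena.read_sql_query(
    sql="SELECT * FROM my_table", database="my_db", ctas_approach=True, chunksize=True
):
    print(len(df.index))

# chunksize=INTEGER: chunks with the requested number of rows.
# Per the Notes above, use ctas_approach=False for deterministic chunk sizes.
for df in wr.athena.read_sql_query(
    sql="SELECT * FROM my_table", database="my_db", ctas_approach=False, chunksize=100_000
):
    print(len(df.index))
```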
@@ -565,7 +580,7 @@ def read_sql_table(
     database: str,
     ctas_approach: bool = True,
     categories: List[str] = None,
-    chunksize: Optional[int] = None,
+    chunksize: Optional[Union[int, bool]] = None,
     s3_output: Optional[str] = None,
     workgroup: Optional[str] = None,
     encryption: Optional[str] = None,
@@ -589,10 +604,6 @@ def read_sql_table(
     CONS: Slower (but still faster than other libraries that use the regular Athena API)
     and does not handle nested types at all.

-    Note
-    ----
-    If `chunksize` is passed, then a Generator of DataFrames is returned.
-
     Note
     ----
     If `ctas_approach` is True, `chunksize` will return non-deterministic chunk sizes,
@@ -603,6 +614,21 @@ def read_sql_table(
     Create the default Athena bucket if it doesn't exist and s3_output is None.
     (E.g. s3://aws-athena-query-results-ACCOUNT-REGION/)

+    Note
+    ----
+    ``Batching`` (`chunksize` argument) (Memory Friendly):
+
+    Enables the function to return an Iterable of DataFrames instead of a regular DataFrame.
+
+    There are two batching strategies in Wrangler:
+
+    - If **chunksize=True**, a new DataFrame will be returned for each file in the query result.
+
+    - If **chunksize=INTEGER**, Wrangler will iterate over the data in chunks with the number of rows equal to the received INTEGER.
+
+    `P.S.` `chunksize=True` is faster and uses less memory, while `chunksize=INTEGER` is more precise
+    in the number of rows for each DataFrame.
+
     Note
     ----
     In case of `use_threads=True` the number of threads that will be spawned will be obtained from os.cpu_count().
@@ -619,8 +645,10 @@ def read_sql_table(
     categories: List[str], optional
         List of column names that should be returned as pandas.Categorical.
        Recommended for memory-restricted environments.
-    chunksize: int, optional
-        If specified, return an generator where chunksize is the number of rows to include in each chunk.
+    chunksize : Union[int, bool], optional
+        If passed, the data will be split into an Iterable of DataFrames (memory friendly).
+        If `True`, Wrangler will iterate over the data by files in the most efficient way, without guaranteeing the chunk size.
+        If an `INTEGER` is passed, Wrangler will iterate over the data in chunks with the number of rows equal to the received INTEGER.
     s3_output : str, optional
         AWS S3 path.
     workgroup : str, optional
@@ -646,6 +674,7 @@ def read_sql_table(
     >>> df = wr.athena.read_sql_table(table='...', database='...')

     """
+    table = catalog.sanitize_table_name(table=table)
    return read_sql_query(
        sql=f'SELECT * FROM "{table}"',
        database=database,
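The added `sanitize_table_name` call means `read_sql_table` now normalizes the table name before interpolating it into the `SELECT` statement. A short sketch of the effect, assuming a hypothetical Glue table originally named `My-Table` (the exact normalization rules are whatever `wr.catalog.sanitize_table_name` implements):

```python
import awswrangler as wr

# The same sanitization read_sql_table now applies internally:
print(wr.catalog.sanitize_table_name(table="My-Table"))  # e.g. "my_table" (assumed output)

# So the caller no longer has to pre-sanitize the table argument:
for df in wr.athena.read_sql_table(table="My-Table", database="my_db", chunksize=True):
    print(df.shape)
```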