1- from typing import Dict , Optional , Any , Iterator
1+ from typing import Dict , Optional , Any , Iterator , List
22from math import ceil
33from itertools import islice
44import re
55import logging
66
7- from pandas import DataFrame
7+ from pandas import DataFrame # type: ignore
88
99from awswrangler import data_types
1010from awswrangler .athena import Athena
@@ -410,7 +410,12 @@ def get_databases(self, catalog_id: Optional[str] = None) -> Iterator[Dict[str,
410410 for db in page ["DatabaseList" ]:
411411 yield db
412412
413- def get_tables (self , catalog_id : Optional [str ] = None , database : Optional [str ] = None , search : Optional [str ] = None , prefix : Optional [str ] = None , suffix : Optional [str ] = None ) -> Iterator [Dict [str , Any ]]:
413+ def get_tables (self ,
414+ catalog_id : Optional [str ] = None ,
415+ database : Optional [str ] = None ,
416+ search : Optional [str ] = None ,
417+ prefix : Optional [str ] = None ,
418+ suffix : Optional [str ] = None ) -> Iterator [Dict [str , Any ]]:
414419 """
415420 Get an iterator of tables
416421
@@ -446,16 +451,32 @@ def get_tables(self, catalog_id: Optional[str] = None, database: Optional[str] =
446451 for tbl in page ["TableList" ]:
447452 yield tbl
448453
449- def tables (self , limit : int = 100 , catalog_id : Optional [str ] = None , database : Optional [str ] = None , search : Optional [str ] = None , prefix : Optional [str ] = None , suffix : Optional [str ] = None ) -> DataFrame :
450- table_iter = self .get_tables (catalog_id = catalog_id , database = database , search = search , prefix = prefix , suffix = suffix )
454+ def tables (self ,
455+ limit : int = 100 ,
456+ catalog_id : Optional [str ] = None ,
457+ database : Optional [str ] = None ,
458+ search : Optional [str ] = None ,
459+ prefix : Optional [str ] = None ,
460+ suffix : Optional [str ] = None ) -> DataFrame :
461+ """
462+ Get a Pandas DataFrame of tables filtered by a search term, prefix, suffix.
463+
464+ :param limit: Max number of tables
465+ :param catalog_id: The ID of the Data Catalog from which to retrieve Databases. If none is provided, the AWS account ID is used by default.
466+ :param database: Glue database name
467+ :param search: Select only tables with the given string in the name.
468+ :param prefix: Select only tables with the given string in the name prefix.
469+ :param suffix: Select only tables with the given string in the name suffix.
470+
471+ :return: Pandas DataFrame filled with formatted information
472+ """
473+ table_iter = self .get_tables (catalog_id = catalog_id ,
474+ database = database ,
475+ search = search ,
476+ prefix = prefix ,
477+ suffix = suffix )
451478 tables = islice (table_iter , limit )
452- df_dict = {
453- "Database" : [],
454- "Table" : [],
455- "Description" : [],
456- "Columns" : [],
457- "Partitions" : []
458- }
479+ df_dict : Dict [str , List ] = {"Database" : [], "Table" : [], "Description" : [], "Columns" : [], "Partitions" : []}
459480 for table in tables :
460481 df_dict ["Database" ].append (table ["DatabaseName" ])
461482 df_dict ["Table" ].append (table ["Name" ])
@@ -468,12 +489,16 @@ def tables(self, limit: int = 100, catalog_id: Optional[str] = None, database: O
468489 return DataFrame (data = df_dict )
469490
470491 def databases (self , limit : int = 100 , catalog_id : Optional [str ] = None ) -> DataFrame :
492+ """
493+ Get a Pandas DataFrame of databases.
494+
495+ :param limit: Max number of databases
496+ :param catalog_id: The ID of the Data Catalog from which to retrieve Databases. If none is provided, the AWS account ID is used by default.
497+ :return: Pandas DataFrame filled with formatted information
498+ """
471499 database_iter = self .get_databases (catalog_id = catalog_id )
472500 dbs = islice (database_iter , limit )
473- df_dict = {
474- "Database" : [],
475- "Description" : []
476- }
501+ df_dict : Dict [str , List ] = {"Database" : [], "Description" : []}
477502 for db in dbs :
478503 df_dict ["Database" ].append (db ["Name" ])
479504 if "Description" in db :
@@ -483,23 +508,19 @@ def databases(self, limit: int = 100, catalog_id: Optional[str] = None) -> DataF
483508 return DataFrame (data = df_dict )
484509
485510 def table (self , database : str , name : str , catalog_id : Optional [str ] = None ) -> DataFrame :
511+ """
512+ Get table details as Pandas Dataframe
513+
514+ :param database: Glue database name
515+ :param name: Table name
516+ :param catalog_id: The ID of the Data Catalog from which to retrieve Databases. If none is provided, the AWS account ID is used by default.
517+ :return: Pandas DataFrame filled with formatted information
518+ """
486519 if catalog_id is None :
487- table : Dict [str , Any ] = self ._client_glue .get_table (
488- DatabaseName = database ,
489- Name = name
490- )["Table" ]
520+ table : Dict [str , Any ] = self ._client_glue .get_table (DatabaseName = database , Name = name )["Table" ]
491521 else :
492- table = self ._client_glue .get_table (
493- CatalogId = catalog_id ,
494- DatabaseName = database ,
495- Name = name
496- )["Table" ]
497- df_dict = {
498- "Column Name" : [],
499- "Type" : [],
500- "Partition" : [],
501- "Comment" : []
502- }
522+ table = self ._client_glue .get_table (CatalogId = catalog_id , DatabaseName = database , Name = name )["Table" ]
523+ df_dict : Dict [str , List ] = {"Column Name" : [], "Type" : [], "Partition" : [], "Comment" : []}
503524 for col in table ["StorageDescriptor" ]["Columns" ]:
504525 df_dict ["Column Name" ].append (col ["Name" ])
505526 df_dict ["Type" ].append (col ["Type" ])
0 commit comments