Skip to content

Commit d7f1d19

Browse files
committed
Add searching feature to Glue Catalog
1 parent 4daad90 commit d7f1d19

File tree

2 files changed

+74
-31
lines changed

2 files changed

+74
-31
lines changed

awswrangler/glue.py

Lines changed: 64 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -413,33 +413,33 @@ def get_databases(self, catalog_id: Optional[str] = None) -> Iterator[Dict[str,
413413
def get_tables(self,
414414
catalog_id: Optional[str] = None,
415415
database: Optional[str] = None,
416-
search: Optional[str] = None,
417-
prefix: Optional[str] = None,
418-
suffix: Optional[str] = None) -> Iterator[Dict[str, Any]]:
416+
name_contains: Optional[str] = None,
417+
name_prefix: Optional[str] = None,
418+
name_suffix: Optional[str] = None) -> Iterator[Dict[str, Any]]:
419419
"""
420420
Get an iterator of tables
421421
422422
:param catalog_id: The ID of the Data Catalog from which to retrieve Databases. If none is provided, the AWS account ID is used by default.
423423
:param database: Filter a specific database
424-
:param search: Select by a specific string on table name
425-
:param prefix: Select by a specific prefix on table name
426-
:param suffix: Select by a specific suffix on table name
424+
:param name_contains: Select by a specific string on table name
425+
:param name_prefix: Select by a specific prefix on table name
426+
:param name_suffix: Select by a specific suffix on table name
427427
:return: Iterator[Dict[str, Any]] of Tables
428428
"""
429429
paginator = self._client_glue.get_paginator("get_tables")
430430
args: Dict[str, str] = {}
431431
if catalog_id is not None:
432432
args["CatalogId"] = catalog_id
433-
if (prefix is not None) and (suffix is not None) and (search is not None):
434-
args["Expression"] = f"{prefix}.*{search}.*{suffix}"
435-
elif (prefix is not None) and (suffix is not None):
436-
args["Expression"] = f"{prefix}.*{suffix}"
437-
elif search is not None:
438-
args["Expression"] = f".*{search}.*"
439-
elif prefix is not None:
440-
args["Expression"] = f"{prefix}.*"
441-
elif suffix is not None:
442-
args["Expression"] = f".*{suffix}"
433+
if (name_prefix is not None) and (name_suffix is not None) and (name_contains is not None):
434+
args["Expression"] = f"{name_prefix}.*{name_contains}.*{name_suffix}"
435+
elif (name_prefix is not None) and (name_suffix is not None):
436+
args["Expression"] = f"{name_prefix}.*{name_suffix}"
437+
elif name_contains is not None:
438+
args["Expression"] = f".*{name_contains}.*"
439+
elif name_prefix is not None:
440+
args["Expression"] = f"{name_prefix}.*"
441+
elif name_suffix is not None:
442+
args["Expression"] = f".*{name_suffix}"
443443
if database is not None:
444444
databases = [database]
445445
else:
@@ -455,27 +455,41 @@ def tables(self,
455455
limit: int = 100,
456456
catalog_id: Optional[str] = None,
457457
database: Optional[str] = None,
458-
search: Optional[str] = None,
459-
prefix: Optional[str] = None,
460-
suffix: Optional[str] = None) -> DataFrame:
458+
search_text: Optional[str] = None,
459+
name_contains: Optional[str] = None,
460+
name_prefix: Optional[str] = None,
461+
name_suffix: Optional[str] = None) -> DataFrame:
461462
"""
462-
Get iterator of tables filtered by a search term, prefix, suffix.
463+
Get a Dataframe with tables filtered by a search term, prefix, suffix.
463464
464465
:param limit: Max number of tables
465466
:param catalog_id: The ID of the Data Catalog from which to retrieve Databases. If none is provided, the AWS account ID is used by default.
466467
:param database: Glue database name
467-
:param search: Select only tables with the given string in the name.
468-
:param prefix: Select only tables with the given string in the name prefix.
469-
:param suffix: Select only tables with the given string in the name suffix.
470-
468+
:param search_text: Select only tables with the given string in table's properties
469+
:param name_contains: Select by a specific string on table name
470+
:param name_prefix: Select only tables with the given string in the name prefix
471+
:param name_suffix: Select only tables with the given string in the name suffix
471472
:return: Pandas Dataframe filled by formatted infos
472473
"""
473-
table_iter = self.get_tables(catalog_id=catalog_id,
474-
database=database,
475-
search=search,
476-
prefix=prefix,
477-
suffix=suffix)
478-
tables = islice(table_iter, limit)
474+
if search_text is None:
475+
table_iter = self.get_tables(catalog_id=catalog_id,
476+
database=database,
477+
name_contains=name_contains,
478+
name_prefix=name_prefix,
479+
name_suffix=name_suffix)
480+
tables: List[Dict[str, Any]] = list(islice(table_iter, limit))
481+
else:
482+
tables = list(self.search_tables(text=search_text, catalog_id=catalog_id))
483+
if database is not None:
484+
tables = [x for x in tables if x["DatabaseName"] == database]
485+
if name_contains is not None:
486+
tables = [x for x in tables if name_contains in x["Name"]]
487+
if name_prefix is not None:
488+
tables = [x for x in tables if x["Name"].startswith(name_prefix)]
489+
if name_suffix is not None:
490+
tables = [x for x in tables if x["Name"].endswith(name_suffix)]
491+
tables = tables[:limit]
492+
479493
df_dict: Dict[str, List] = {"Database": [], "Table": [], "Description": [], "Columns": [], "Partitions": []}
480494
for table in tables:
481495
df_dict["Database"].append(table["DatabaseName"])
@@ -488,6 +502,26 @@ def tables(self,
488502
df_dict["Partitions"].append(", ".join([x["Name"] for x in table["PartitionKeys"]]))
489503
return DataFrame(data=df_dict)
490504

505+
def search_tables(self, text: str, catalog_id: Optional[str] = None) -> Iterator[Dict[str, Any]]:
    """
    Get iterator of tables filtered by a search string.

    Result pages are requested one at a time, as the iterator is consumed.

    :param text: Select only tables with the given string in table's properties.
    :param catalog_id: The ID of the Data Catalog to search. If none is provided, the AWS account ID is used by default.
    :return: Iterator[Dict[str, Any]] of Tables
    """
    args: Dict[str, Any] = {"SearchText": text}
    if catalog_id is not None:
        args["CatalogId"] = catalog_id
    while True:
        response = self._client_glue.search_tables(**args)
        yield from response["TableList"]
        # Stop on a missing OR empty token: sending an empty NextToken back
        # would issue a pointless (and possibly rejected) extra request.
        next_token = response.get("NextToken")
        if not next_token:
            break
        args["NextToken"] = next_token
524+
491525
def databases(self, limit: int = 100, catalog_id: Optional[str] = None) -> DataFrame:
492526
"""
493527
Get iterator of databases.

testing/test_awswrangler/test_glue.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ def test_get_tables_database(session, database):
106106

107107

108108
def test_get_tables_search(session, table):
109-
tables = list(session.glue.get_tables(search=table[1:-1]))
109+
tables = list(session.glue.search_tables(text="parquet"))
110110
assert len(tables) > 0
111111
for tbl in tables:
112112
if tbl["Name"] == table:
@@ -133,3 +133,12 @@ def test_glue_utils(session, database, table):
133133
assert len(session.glue.databases().index) > 1
134134
assert len(session.glue.tables().index) > 1
135135
assert len(session.glue.table(database=database, name=table).index) > 1
136+
137+
138+
def test_glue_tables_full(session, database, table):
    """Call ``glue.tables`` with every filter set at once and check rows come back."""
    # Split the table name into first character, middle and last character so
    # the prefix, contains and suffix filters are all derived from that name.
    first_char, middle, last_char = table[0], table[1:-1], table[-1]
    dataframe = session.glue.tables(
        database=database,
        search_text="parquet",
        name_contains=middle,
        name_prefix=first_char,
        name_suffix=last_char,
    )
    assert len(dataframe.index) > 1

0 commit comments

Comments
 (0)