Skip to content

Commit bd636fc

Browse files
k-popov authored and yoonhyejin committed
feat(ingest): Add metabase database id to platform instance mapping (#8359)
1 parent 9e59bf9 commit bd636fc

File tree

4 files changed

+102
-7
lines changed

4 files changed

+102
-7
lines changed

metadata-ingestion/docs/sources/metabase/metabase.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,14 @@ the underlying datasets in the `glue` platform, the following snippet can be use
99
DataHub will try to determine database name from Metabase [api/database](https://www.metabase.com/docs/latest/api-documentation.html#database)
1010
payload. However, the name can be overridden from `database_alias_map` for a given database connected to Metabase.
1111

12+
If several platform instances with the same platform (e.g. from several distinct clickhouse clusters) are present in DataHub,
13+
the mapping between database id in Metabase and platform instance in DataHub may be configured with the following map:
14+
```yml
15+
database_id_to_instance_map:
16+
"42": platform_instance_in_datahub
17+
```
18+
The key in this map must be a string, not an integer, although the Metabase API provides `id` as a number.
19+
If `database_id_to_instance_map` is not specified, or has no entry for a given database id, `platform_instance_map` is used for platform instance mapping. If neither mapping yields a value, no platform instance is used when constructing the `urn` while searching for dataset relations.
1220
## Compatibility
1321

1422
Metabase version [v0.41.2](https://www.metabase.com/start/oss/)

metadata-ingestion/docs/sources/metabase/metabase.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ source:
1515
# Optional mapping of platform types to instance ids
1616
platform_instance_map: # optional
1717
postgres: test_postgres # optional
18+
database_id_to_instance_map: # optional
19+
"42": platform_instance_in_datahub # optional
1820

1921
sink:
20-
# sink configs
22+
# sink configs

metadata-ingestion/src/datahub/ingestion/source/metabase.py

Lines changed: 49 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from datetime import datetime, timezone
22
from functools import lru_cache
3-
from typing import Dict, Iterable, List, Optional
3+
from typing import Dict, Iterable, List, Optional, Union
44

55
import dateutil.parser as dp
66
import pydantic
@@ -60,6 +60,10 @@ class MetabaseConfig(DatasetLineageProviderConfigBase):
6060
default=None,
6161
description="Custom mappings between metabase database engines and DataHub platforms",
6262
)
63+
database_id_to_instance_map: Optional[Dict[str, str]] = Field(
64+
default=None,
65+
description="Custom mappings between metabase database id and DataHub platform instance",
66+
)
6367
default_schema: str = Field(
6468
default="public",
6569
description="Default schema name to use when schema is not provided in an SQL query",
@@ -122,7 +126,9 @@ def __init__(self, ctx: PipelineContext, config: MetabaseConfig):
122126
super().__init__(ctx)
123127
self.config = config
124128
self.report = SourceReport()
129+
self.setup_session()
125130

131+
def setup_session(self) -> None:
126132
login_response = requests.post(
127133
f"{self.config.connect_uri}/api/session",
128134
None,
@@ -272,6 +278,16 @@ def _get_ownership(self, creator_id: int) -> Optional[OwnershipClass]:
272278
user_info_response.raise_for_status()
273279
user_details = user_info_response.json()
274280
except HTTPError as http_error:
281+
if (
282+
http_error.response is not None
283+
and http_error.response.status_code == 404
284+
):
285+
self.report.report_warning(
286+
key=f"metabase-user-{creator_id}",
287+
reason=f"User {creator_id} is blocked in Metabase or missing",
288+
)
289+
return None
290+
# For cases when the error is not 404 but something else
275291
self.report.report_failure(
276292
key=f"metabase-user-{creator_id}",
277293
reason=f"Unable to retrieve User info. " f"Reason: {str(http_error)}",
@@ -524,6 +540,36 @@ def get_source_table_from_id(self, table_id):
524540

525541
return None, None
526542

543+
def get_platform_instance(
    self, platform: Union[str, None] = None, datasource_id: Union[int, None] = None
) -> Union[str, None]:
    """
    Resolve the DataHub ``platform_instance`` for a Metabase datasource.

    Lookup order:

    1. ``config.database_id_to_instance_map``, keyed by ``str(datasource_id)``.
       This covers setups where Metabase is connected to several instances of
       the same platform (e.g. several individual ClickHouse clusters).
    2. ``config.platform_instance_map``, keyed by ``platform``. Used when the
       first mapping is not configured or has no entry for the datasource.

    The result is intentionally not memoized: both steps are plain dictionary
    lookups, and caching on an instance method would keep ``self`` alive in a
    global cache and return stale values after ``self.config`` is modified.

    :param str platform: DataHub platform name (e.g. `postgres` or `clickhouse`)
    :param int datasource_id: Numeric datasource ID received from Metabase API
    :return: platform instance name or None if no mapping matches
    """
    id_to_instance = self.config.database_id_to_instance_map
    if datasource_id is not None and id_to_instance:
        # Config keys are strings, while the Metabase API reports numeric ids.
        mapped_instance = id_to_instance.get(str(datasource_id))
        if mapped_instance is not None:
            return mapped_instance

    # Datasource id is not mapped to a platform instance: fall back to the
    # platform-name mapping, if one is configured.
    platform_to_instance = self.config.platform_instance_map
    if platform and platform_to_instance:
        return platform_to_instance.get(platform)

    return None
572+
527573
@lru_cache(maxsize=None)
528574
def get_datasource_from_id(self, datasource_id):
529575
try:
@@ -564,11 +610,8 @@ def get_datasource_from_id(self, datasource_id):
564610
reason=f"Platform was not found in DataHub. Using {platform} name as is",
565611
)
566612

567-
# Set platform_instance if configuration provides a mapping from platform name to instance
568-
platform_instance = (
569-
self.config.platform_instance_map.get(platform)
570-
if self.config.platform_instance_map
571-
else None
613+
platform_instance = self.get_platform_instance(
614+
platform, dataset_json.get("id", None)
572615
)
573616

574617
field_for_dbname_mapping = {
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
from datahub.ingestion.api.common import PipelineContext
2+
from datahub.ingestion.api.source import SourceReport
3+
from datahub.ingestion.source.metabase import MetabaseConfig, MetabaseSource
4+
5+
6+
class TestMetabaseSource(MetabaseSource):
    """
    Lightweight ``MetabaseSource`` stand-in for unit tests.

    The constructor only stores ``config`` and a fresh ``SourceReport``. It
    deliberately does not call ``MetabaseSource.__init__``, so none of the
    parent's start-up work runs when the object is built.
    """

    # The class name starts with "Test", so pytest would try to collect this
    # helper as a test class and warn about its __init__; opt out explicitly.
    __test__ = False

    def __init__(self, ctx: PipelineContext, config: MetabaseConfig):
        # ``ctx`` is accepted to mirror the parent signature but is not stored.
        self.config = config
        self.report = SourceReport()
10+
11+
12+
def test_get_platform_instance():
    """
    Verify the lookup order of ``get_platform_instance``.

    The database-id mapping is consulted first, then the platform-name
    mapping, and ``None`` is returned when neither yields a value.
    Every call uses an argument tuple not used earlier in this test, so no
    result can be served from a memoized earlier call made under a different
    configuration.
    """
    ctx = PipelineContext(run_id="test-metabase")
    config = MetabaseConfig()
    config.connect_uri = "http://localhost:3000"
    metabase = TestMetabaseSource(ctx, config)

    # no mappings defined
    assert metabase.get_platform_instance("clickhouse", 42) is None

    # database_id_to_instance_map is defined, key is present
    metabase.config.database_id_to_instance_map = {"42": "my_main_clickhouse"}
    assert metabase.get_platform_instance(None, 42) == "my_main_clickhouse"

    # database_id_to_instance_map is defined, key is missing
    assert metabase.get_platform_instance(None, 999) is None

    # database_id_to_instance_map is defined, key is missing, platform_instance_map is defined and key present
    metabase.config.platform_instance_map = {"clickhouse": "my_only_clickhouse"}
    assert metabase.get_platform_instance("clickhouse", 999) == "my_only_clickhouse"

    # database_id_to_instance_map is defined, key is missing, platform_instance_map is defined and key missing
    assert metabase.get_platform_instance("missing-platform", 999) is None

    # database_id_to_instance_map is missing, platform_instance_map is defined and key present
    metabase.config.database_id_to_instance_map = None
    assert metabase.get_platform_instance("clickhouse", 1000) == "my_only_clickhouse"

    # database_id_to_instance_map is missing, platform_instance_map is defined and key missing
    assert metabase.get_platform_instance("missing-platform", 1000) is None

0 commit comments

Comments
 (0)