Skip to content

Commit f860f79

Browse files
authored
fix(ingest/transformer): replace externalUrl in dataset properties (#10281)
1 parent 771ab0d commit f860f79

File tree

4 files changed

+169
-3
lines changed

4 files changed

+169
-3
lines changed

metadata-ingestion/docs/transformer/dataset_transformer.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -907,6 +907,24 @@ Then define your class to return a list of custom properties, for example:
907907
add_properties_resolver_class: "<your_module>.<your_class>"
908908
```
909909

910+
## Replace ExternalUrl
911+
### Config Details
912+
| Field | Required | Type | Default | Description |
913+
|-----------------------------|----------|---------|---------------|---------------------------------------------|
914+
| `input_pattern` | ✅ | string | | String or regular expression pattern to replace |
915+
| `replacement` | ✅ | string | | Replacement string |
916+
917+
918+
Matches all or part of the `externalUrl` in the dataset properties against `input_pattern` (a regular expression) and replaces every match with the replacement string.
919+
920+
```yaml
921+
transformers:
922+
- type: "replace_external_url"
923+
config:
924+
input_pattern: '\b\w*hub\b'
925+
replacement: "sub"
926+
```
927+
910928
## Simple Add Dataset domains
911929
### Config Details
912930
| Field | Required | Type | Default | Description |

metadata-ingestion/setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -687,6 +687,7 @@
687687
"add_dataset_dataproduct = datahub.ingestion.transformer.add_dataset_dataproduct:AddDatasetDataProduct",
688688
"simple_add_dataset_dataproduct = datahub.ingestion.transformer.add_dataset_dataproduct:SimpleAddDatasetDataProduct",
689689
"pattern_add_dataset_dataproduct = datahub.ingestion.transformer.add_dataset_dataproduct:PatternAddDatasetDataProduct",
690+
"replace_external_url = datahub.ingestion.transformer.replace_external_url:ReplaceExternalUrl"
690691
],
691692
"datahub.ingestion.sink.plugins": [
692693
"file = datahub.ingestion.sink.file:FileSink",
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
import copy
2+
import re
3+
from typing import Any, Dict, Optional, cast
4+
5+
from datahub.configuration.common import ConfigModel
6+
from datahub.emitter.mce_builder import Aspect
7+
from datahub.ingestion.api.common import PipelineContext
8+
from datahub.ingestion.transformer.dataset_transformer import (
9+
DatasetPropertiesTransformer,
10+
)
11+
from datahub.metadata.schema_classes import DatasetPropertiesClass
12+
13+
14+
class ReplaceExternalUrlConfig(ConfigModel):
    """Settings for the ``replace_external_url`` transformer."""

    # Pattern searched for in the dataset's externalUrl; the transformer
    # passes it to ``re.compile``, so regular-expression syntax applies.
    input_pattern: str
    # Text substituted for every match of ``input_pattern``.
    replacement: str
17+
18+
19+
class ReplaceExternalUrl(DatasetPropertiesTransformer):
    """Transformer that rewrites the ``externalUrl`` of dataset properties.

    Every match of the configured ``input_pattern`` (a regular expression)
    found in the aspect's ``externalUrl`` is replaced with ``replacement``.
    Aspects that are missing or have no ``externalUrl`` pass through
    unchanged.
    """

    ctx: PipelineContext
    config: ReplaceExternalUrlConfig

    def __init__(
        self,
        config: ReplaceExternalUrlConfig,
        ctx: PipelineContext,
        **resolver_args: Dict[str, Any],
    ):
        """Store the configuration and pre-compile the search pattern.

        Raises:
            re.error: if ``config.input_pattern`` is not a valid regular
                expression.
        """
        super().__init__()
        self.ctx = ctx
        self.config = config
        self.resolver_args = resolver_args
        # Compile once here instead of once per aspect: the pattern is
        # invariant for the transformer's lifetime, and an invalid pattern
        # now fails when the transformer is built rather than on the first
        # record that happens to carry an externalUrl.
        self._pattern = re.compile(config.input_pattern)

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> "ReplaceExternalUrl":
        """Build the transformer from a raw configuration dictionary."""
        config = ReplaceExternalUrlConfig.parse_obj(config_dict)
        return cls(config, ctx)

    def transform_aspect(
        self, entity_urn: str, aspect_name: str, aspect: Optional[Aspect]
    ) -> Optional[Aspect]:
        """Return the aspect with pattern matches in ``externalUrl`` replaced.

        Args:
            entity_urn: URN of the entity the aspect belongs to (unused here).
            aspect_name: Name of the aspect being transformed (unused here).
            aspect: The incoming aspect, treated as dataset properties; may
                be ``None``.

        Returns:
            The incoming aspect itself when it is ``None`` or has no
            ``externalUrl``; otherwise a deep copy whose ``externalUrl`` has
            every match of the configured pattern replaced.
        """
        in_dataset_properties_aspect: DatasetPropertiesClass = cast(
            DatasetPropertiesClass, aspect
        )

        # Guard clause: nothing to rewrite when the aspect is absent or its
        # externalUrl is unset/empty.
        external_url = getattr(in_dataset_properties_aspect, "externalUrl", None)
        if not external_url:
            return cast(Aspect, in_dataset_properties_aspect)

        # Work on a copy so the caller's aspect object is left untouched.
        out_dataset_properties_aspect: DatasetPropertiesClass = copy.deepcopy(
            in_dataset_properties_aspect
        )
        out_dataset_properties_aspect.externalUrl = self._pattern.sub(
            self.config.replacement, external_url
        )

        return cast(Aspect, out_dataset_properties_aspect)

metadata-ingestion/tests/unit/test_transform_dataset.py

Lines changed: 85 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -72,9 +72,13 @@
7272
ExtractOwnersFromTagsTransformer,
7373
)
7474
from datahub.ingestion.transformer.mark_dataset_status import MarkDatasetStatus
75+
from datahub.ingestion.transformer.pattern_cleanup_ownership import (
76+
PatternCleanUpOwnership,
77+
)
7578
from datahub.ingestion.transformer.remove_dataset_ownership import (
7679
SimpleRemoveDatasetOwnership,
7780
)
81+
from datahub.ingestion.transformer.replace_external_url import ReplaceExternalUrl
7882
from datahub.metadata.schema_classes import (
7983
BrowsePathsClass,
8084
DatasetPropertiesClass,
@@ -87,9 +91,6 @@
8791
)
8892
from datahub.utilities.urns.dataset_urn import DatasetUrn
8993
from datahub.utilities.urns.urn import Urn
90-
from src.datahub.ingestion.transformer.pattern_cleanup_ownership import (
91-
PatternCleanUpOwnership,
92-
)
9394

9495

9596
def make_generic_dataset(
@@ -3209,3 +3210,84 @@ def test_clean_owner_urn_transformation_should_not_remove_system_identifier(
32093210
config: List[Union[re.Pattern, str]] = ["urn:li:corpuser:"]
32103211

32113212
_test_clean_owner_urns(pipeline_context, in_owner_urns, config, in_owner_urns)
3213+
3214+
3215+
def test_replace_external_url_word_replace(
    mock_datahub_graph,
):
    """A literal-word pattern is replaced where it occurs in externalUrl."""
    ctx: PipelineContext = PipelineContext(run_id="test_replace_external_url")
    ctx.graph = mock_datahub_graph(DatahubClientConfig)

    source_aspect = models.DatasetPropertiesClass(
        externalUrl="https://github.com/datahub/looker-demo/blob/master/foo.view.lkml",
        customProperties=EXISTING_PROPERTIES.copy(),
    )
    transformer_config = {"input_pattern": "datahub", "replacement": "starhub"}

    results = run_dataset_transformer_pipeline(
        transformer_type=ReplaceExternalUrl,
        aspect=source_aspect,
        config=transformer_config,
        pipeline_context=ctx,
    )

    assert len(results) == 2
    first_record = results[0].record
    assert first_record
    assert first_record.aspect
    expected_url = "https://github.com/starhub/looker-demo/blob/master/foo.view.lkml"
    assert first_record.aspect.externalUrl == expected_url
3240+
3241+
3242+
def test_replace_external_regex_replace_1(
    mock_datahub_graph,
):
    """A greedy regex spanning several path segments is replaced as one match."""
    ctx: PipelineContext = PipelineContext(run_id="test_replace_external_url")
    ctx.graph = mock_datahub_graph(DatahubClientConfig)

    source_aspect = models.DatasetPropertiesClass(
        externalUrl="https://github.com/datahub/looker-demo/blob/master/foo.view.lkml",
        customProperties=EXISTING_PROPERTIES.copy(),
    )
    transformer_config = {
        "input_pattern": r"datahub/.*/",
        "replacement": "starhub/test/",
    }

    results = run_dataset_transformer_pipeline(
        transformer_type=ReplaceExternalUrl,
        aspect=source_aspect,
        config=transformer_config,
        pipeline_context=ctx,
    )

    assert len(results) == 2
    first_record = results[0].record
    assert first_record
    assert first_record.aspect
    expected_url = "https://github.com/starhub/test/foo.view.lkml"
    assert first_record.aspect.externalUrl == expected_url
3267+
3268+
3269+
def test_replace_external_regex_replace_2(
    mock_datahub_graph,
):
    """A word-boundary regex replaces every word ending in "hub" in the URL."""
    ctx: PipelineContext = PipelineContext(run_id="test_replace_external_url")
    ctx.graph = mock_datahub_graph(DatahubClientConfig)

    source_aspect = models.DatasetPropertiesClass(
        externalUrl="https://github.com/datahub/looker-demo/blob/master/foo.view.lkml",
        customProperties=EXISTING_PROPERTIES.copy(),
    )
    transformer_config = {"input_pattern": r"\b\w*hub\b", "replacement": "test"}

    results = run_dataset_transformer_pipeline(
        transformer_type=ReplaceExternalUrl,
        aspect=source_aspect,
        config=transformer_config,
        pipeline_context=ctx,
    )

    assert len(results) == 2
    first_record = results[0].record
    assert first_record
    assert first_record.aspect
    expected_url = "https://test.com/test/looker-demo/blob/master/foo.view.lkml"
    assert first_record.aspect.externalUrl == expected_url

0 commit comments

Comments
 (0)