Skip to content

Commit 5a15697

Browse files
authored
feat(docs): add structured property for search field names in metadata model (#15097)
1 parent 7372b91 commit 5a15697

File tree

2 files changed

+105
-1
lines changed

2 files changed

+105
-1
lines changed

.github/workflows/metadata-model.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ on:
55
- master
66
paths:
77
- "metadata-models/**"
8+
- "metadata-ingestion/scripts/modeldocgen.py"
9+
- "metadata-ingestion/scripts/modeldocupload.sh"
810
release:
911
types: [published]
1012

metadata-ingestion/scripts/modeldocgen.py

Lines changed: 103 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,14 @@
3939
SchemaFieldDataTypeClass,
4040
SchemaMetadataClass,
4141
StringTypeClass,
42+
StructuredPropertiesClass,
43+
StructuredPropertyDefinitionClass,
44+
StructuredPropertySettingsClass,
45+
StructuredPropertyValueAssignmentClass,
4246
SubTypesClass,
4347
TagAssociationClass,
4448
)
49+
from datahub.metadata.urns import StructuredPropertyUrn, Urn
4550

4651
logger = logging.getLogger(__name__)
4752

@@ -888,6 +893,54 @@ def make_entity_docs(entity_display_name: str, graph: RelationshipGraph) -> str:
888893
raise Exception(f"Failed to find information for entity: {entity_name}")
889894

890895

896+
def create_search_field_name_property() -> List[MetadataChangeProposalWrapper]:
    """
    Build the MCPs that register the "Search Field Name" structured property.

    The property records the name a field is indexed under in search when that
    name is not the same as the schema field name (for example, the 'instance'
    field being indexed as 'platformInstance').

    Returns:
        A two-element list: the property definition MCP followed by the
        property settings MCP, both addressed to the same property URN.
    """
    qualified_name = "com.datahub.metadata.searchFieldName"
    urn_text = f"urn:li:structuredProperty:{qualified_name}"
    structured_property_urn = str(StructuredPropertyUrn.from_string(urn_text))

    # Definition aspect: a single-valued, mutable string property that can be
    # attached to schemaField entities.
    definition_aspect = StructuredPropertyDefinitionClass(
        qualifiedName=qualified_name,
        displayName="Search Field Name",
        valueType=Urn.make_data_type_urn("string"),
        description=(
            "The field name used in the search index when it differs from the schema field name. "
            "Use this field name when constructing search queries for this field."
        ),
        entityTypes=[Urn.make_entity_type_urn("schemaField")],
        cardinality="SINGLE",
        immutable=False,
    )
    definition = MetadataChangeProposalWrapper(
        entityUrn=structured_property_urn,
        aspect=definition_aspect,
    )

    # Settings aspect: display flags for the property; it is shown in the
    # asset summary and as a column in schema tables, and nowhere else.
    settings_aspect = StructuredPropertySettingsClass(
        isHidden=False,
        showInSearchFilters=False,
        showInAssetSummary=True,
        showAsAssetBadge=False,
        showInColumnsTable=True,
    )
    settings = MetadataChangeProposalWrapper(
        entityUrn=structured_property_urn,
        aspect=settings_aspect,
    )

    # Definition goes first so it precedes the settings that refer to it.
    return [definition, settings]
942+
943+
891944
def generate_stitched_record(
892945
relnships_graph: RelationshipGraph,
893946
) -> Iterable[MetadataChangeProposalWrapper]:
@@ -897,6 +950,11 @@ def strip_types(field_path: str) -> str:
897950
final_path = re.sub(r"^\[version=2.0\]\.", "", final_path)
898951
return final_path
899952

953+
# Track schema fields that need structured properties
954+
schema_field_properties: Dict[
955+
str, str
956+
] = {} # schema_field_urn -> search_field_name
957+
900958
for entity_name, entity_def in entity_registry.items():
901959
entity_display_name = entity_def.display_name
902960
entity_fields = []
@@ -981,6 +1039,28 @@ def strip_types(field_path: str) -> str:
9811039
f_field.globalTags.tags.append(
9821040
TagAssociationClass(tag="urn:li:tag:Searchable")
9831041
)
1042+
1043+
# Check if search field name differs from actual field name
1044+
searchable_config = json_dict["Searchable"]
1045+
if (
1046+
isinstance(searchable_config, dict)
1047+
and "fieldName" in searchable_config
1048+
):
1049+
search_field_name = searchable_config["fieldName"]
1050+
# Extract the actual field name from the field path
1051+
# Field path format: "[version=2.0].[type=...].<fieldName>"
1052+
actual_field_name = strip_types(f_field.fieldPath).split(
1053+
"."
1054+
)[-1]
1055+
1056+
if search_field_name != actual_field_name:
1057+
# Track this for later - we'll emit a separate MCP for the schema field entity
1058+
schema_field_urn = make_schema_field_urn(
1059+
source_dataset_urn, f_field.fieldPath
1060+
)
1061+
schema_field_properties[schema_field_urn] = (
1062+
search_field_name
1063+
)
9841064
if "Relationship" in json_dict:
9851065
relationship_info = json_dict["Relationship"]
9861066
# detect if we have relationship specified at leaf level or thru path specs
@@ -1064,6 +1144,21 @@ def strip_types(field_path: str) -> str:
10641144
],
10651145
)
10661146

1147+
# Emit structured properties for schema fields
1148+
property_urn = "urn:li:structuredProperty:com.datahub.metadata.searchFieldName"
1149+
for schema_field_urn, search_field_name in schema_field_properties.items():
1150+
yield MetadataChangeProposalWrapper(
1151+
entityUrn=schema_field_urn,
1152+
aspect=StructuredPropertiesClass(
1153+
properties=[
1154+
StructuredPropertyValueAssignmentClass(
1155+
propertyUrn=property_urn,
1156+
values=[search_field_name],
1157+
)
1158+
]
1159+
),
1160+
)
1161+
10671162

10681163
@dataclass
10691164
class EntityAspectName:
@@ -1256,8 +1351,15 @@ def generate( # noqa: C901
12561351
logger.error(f"Failed to generate lineage JSON: {e}")
12571352
raise
12581353

1354+
# Create structured property for search field names first
1355+
logger.info("Creating structured property for search field names")
1356+
structured_property_mcps = create_search_field_name_property()
1357+
12591358
relationship_graph = RelationshipGraph()
1260-
mcps = list(generate_stitched_record(relationship_graph))
1359+
entity_mcps = list(generate_stitched_record(relationship_graph))
1360+
1361+
# Combine MCPs with structured property first
1362+
mcps = structured_property_mcps + entity_mcps
12611363

12621364
shutil.rmtree(f"{generated_docs_dir}/entities", ignore_errors=True)
12631365
entity_names = [(x, entity_registry[x]) for x in generated_documentation]

0 commit comments

Comments
 (0)