|
20 | 20 |
|
21 | 21 | logger = logging.getLogger(__name__) |
22 | 22 |
|
| 23 | +TRANSFORMATION_TYPE_DIRECT = "DIRECT" |
| 24 | + |
23 | 25 | TRANSFORMATION_SUBTYPE_MAP_MASKING = { |
24 | 26 | "TRANSFORMATION": DatasetColumnRelationTypeDTO.TRANSFORMATION_MASKING, |
25 | 27 | "AGGREGATION": DatasetColumnRelationTypeDTO.AGGREGATION_MASKING, |
@@ -77,51 +79,65 @@ def extract_column_lineage( |
77 | 79 | # Grouping column lineage by source+target dataset. This is unique combination within operation, |
78 | 80 | # so we can use it to generate the same fingerprint for all dataset column relations |
79 | 81 | datasets = {target_dataset_dto.unique_key: target_dataset_dto} |
80 | | - dataset_column_relations = defaultdict(list) |
| 82 | + dataset_column_relations: dict[tuple, dict[tuple, DatasetColumnRelationDTO]] = defaultdict(dict) |
81 | 83 |
|
82 | 84 | # direct lineage (source_column -> target_column) |
83 | 85 | for field, raw_column_lineage in target_dataset.facets.columnLineage.fields.items(): |
84 | 86 | for input_field in raw_column_lineage.inputFields: |
85 | 87 | source_dataset_dto = resolve_dataset_ref(input_field, dataset_cache) |
86 | 88 | datasets[source_dataset_dto.unique_key] = source_dataset_dto |
87 | 89 |
|
88 | | - column_lineage_key = (source_dataset_dto.unique_key, target_dataset_dto.unique_key) |
89 | | - for transformation in input_field.transformations: |
90 | | - # OL integration for Spark before v1.23 (or with columnLineage.datasetLineageEnabled=false, which is still default) # noqa: E501 |
91 | | - # produced INDIRECT lineage for each combination source_column x target_column, |
92 | | - # which is amlost the cartesian join. It is VERY expensive to handle, just ignore. |
93 | | - # See https://github.com/OpenLineage/OpenLineage/pull/3097 |
94 | | - if transformation.type == "INDIRECT": |
95 | | - continue |
| 90 | + dataset_relation_key = (source_dataset_dto.unique_key, target_dataset_dto.unique_key) |
| 91 | + dataset_column_relation = dataset_column_relations[dataset_relation_key] |
96 | 92 |
|
| 93 | + for transformation in input_field.transformations: |
| 94 | + # OL integration for Spark before v1.23 |
| 95 | + # or with columnLineage.datasetLineageEnabled=false (which is still default) |
| 96 | + # produces INDIRECT lineage for each combination source_column x target_column, |
| 97 | + # which is almost a cartesian product. |
| 98 | + # There are a lot of duplicates here, trying to avoid them by merging items immediately. |
97 | 99 | column_relation = DatasetColumnRelationDTO( |
98 | 100 | type=extract_dataset_column_relation_type(transformation), |
99 | 101 | source_column=input_field.field, |
100 | | - target_column=field, |
| 102 | + target_column=field if transformation.type == TRANSFORMATION_TYPE_DIRECT else None, |
101 | 103 | ) |
102 | | - dataset_column_relations[column_lineage_key].append(column_relation) |
| 104 | + column_relation_key = column_relation.unique_key |
| 105 | + |
| 106 | + existing_column_relation = dataset_column_relation.get(column_relation_key) |
| 107 | + if existing_column_relation: |
| 108 | + dataset_column_relation[column_relation_key] = existing_column_relation.merge(column_relation) |
| 109 | + else: |
| 110 | + dataset_column_relation[column_relation_key] = column_relation |
103 | 111 |
|
104 | 112 | # indirect lineage (source_column -> target_dataset), |
105 | 113 | # added to OL since v1.23 and send only when columnLineage.datasetLineageEnabled=true |
106 | 114 | for input_field in target_dataset.facets.columnLineage.dataset: |
107 | 115 | source_dataset_dto = resolve_dataset_ref(input_field, dataset_cache) |
108 | 116 | datasets[source_dataset_dto.unique_key] = source_dataset_dto |
109 | 117 |
|
110 | | - column_lineage_key = (source_dataset_dto.unique_key, target_dataset_dto.unique_key) |
| 118 | + dataset_relation_key = (source_dataset_dto.unique_key, target_dataset_dto.unique_key) |
| 119 | + dataset_column_relation = dataset_column_relations[dataset_relation_key] |
| 120 | + |
111 | 121 | for transformation in input_field.transformations: |
112 | 122 | column_relation = DatasetColumnRelationDTO( |
113 | 123 | type=extract_dataset_column_relation_type(transformation), |
114 | 124 | source_column=input_field.field, |
115 | 125 | ) |
116 | | - dataset_column_relations[column_lineage_key].append(column_relation) |
| 126 | + column_relation_key = column_relation.unique_key |
| 127 | + |
| 128 | + existing_column_relation = dataset_column_relation.get(column_relation_key) |
| 129 | + if existing_column_relation: |
| 130 | + dataset_column_relation[column_relation_key] = existing_column_relation.merge(column_relation) |
| 131 | + else: |
| 132 | + dataset_column_relation[column_relation_key] = column_relation |
117 | 133 |
|
118 | 134 | # merge results into DTO objects |
119 | 135 | return [ |
120 | 136 | ColumnLineageDTO( |
121 | 137 | operation=operation, |
122 | 138 | source_dataset=datasets[source_dataset_dto_key], |
123 | 139 | target_dataset=datasets[target_dataset_dto_key], |
124 | | - dataset_column_relations=relations, |
| 140 | + dataset_column_relations=list(relations.values()), |
125 | 141 | ) |
126 | 142 | for (source_dataset_dto_key, target_dataset_dto_key), relations in dataset_column_relations.items() |
127 | 143 | if dataset_column_relations |
|
0 commit comments