[DOP-23708] Cache datasets for column lineage extraction

dolfinus · dolfinus · commit ca661d80ead7 · 2025-03-07T15:22:11.000+03:00
diff --git a/data_rentgen/consumer/extractors/__init__.py b/data_rentgen/consumer/extractors/__init__.py
@@ -9,10 +9,10 @@
     extract_dataset_and_symlinks,
 )
 from data_rentgen.consumer.extractors.input import extract_input
-from data_rentgen.consumer.extractors.job import extract_job
+from data_rentgen.consumer.extractors.job import extract_job, extract_parent_job
 from data_rentgen.consumer.extractors.operation import extract_operation
 from data_rentgen.consumer.extractors.output import extract_output
-from data_rentgen.consumer.extractors.run import extract_run, extract_run_minimal
+from data_rentgen.consumer.extractors.run import extract_parent_run, extract_run
 from data_rentgen.consumer.extractors.schema import extract_schema
 
 __all__ = [
@@ -26,7 +26,8 @@
     "extract_job",
     "extract_operation",
     "extract_output",
+    "extract_parent_job",
+    "extract_parent_run",
     "extract_run",
-    "extract_run_minimal",
     "extract_schema",
 ]
diff --git a/data_rentgen/consumer/extractors/batch.py b/data_rentgen/consumer/extractors/batch.py
@@ -156,123 +156,133 @@ def add_schema(self, schema: SchemaDTO):
     def add_user(self, user: UserDTO):
         self._add(self._users, user)
 
-    def _get_location(self, location_key: tuple) -> LocationDTO:
+    def get_location(self, location_key: tuple) -> LocationDTO:
         return self._locations[location_key]
 
-    def _get_schema(self, schema_key: tuple) -> SchemaDTO:
+    def get_schema(self, schema_key: tuple) -> SchemaDTO:
         return self._schemas[schema_key]
 
-    def _get_user(self, user_key: tuple) -> UserDTO:
+    def get_user(self, user_key: tuple) -> UserDTO:
         return self._users[user_key]
 
-    def _get_dataset(self, dataset_key: tuple) -> DatasetDTO:
+    def get_dataset(self, dataset_key: tuple) -> DatasetDTO:
         dataset = self._datasets[dataset_key]
-        dataset.location = self._get_location(dataset.location.unique_key)
+        dataset.location = self.get_location(dataset.location.unique_key)
         return dataset
 
-    def _get_dataset_symlink(self, dataset_symlink_key: tuple) -> DatasetSymlinkDTO:
+    def get_dataset_symlink(self, dataset_symlink_key: tuple) -> DatasetSymlinkDTO:
         dataset_symlink = self._dataset_symlinks[dataset_symlink_key]
-        dataset_symlink.from_dataset = self._get_dataset(dataset_symlink.from_dataset.unique_key)
-        dataset_symlink.to_dataset = self._get_dataset(dataset_symlink.to_dataset.unique_key)
+        dataset_symlink.from_dataset = self.get_dataset(dataset_symlink.from_dataset.unique_key)
+        dataset_symlink.to_dataset = self.get_dataset(dataset_symlink.to_dataset.unique_key)
         return dataset_symlink
 
-    def _get_job(self, job_key: tuple) -> JobDTO:
+    def get_job(self, job_key: tuple) -> JobDTO:
         job = self._jobs[job_key]
-        job.location = self._get_location(job.location.unique_key)
+        job.location = self.get_location(job.location.unique_key)
         return job
 
-    def _get_run(self, run_key: tuple) -> RunDTO:
+    def get_run(self, run_key: tuple) -> RunDTO:
         run = self._runs[run_key]
-        run.job = self._get_job(run.job.unique_key)
+        run.job = self.get_job(run.job.unique_key)
         if run.parent_run:
-            run.parent_run = self._get_run(run.parent_run.unique_key)
+            run.parent_run = self.get_run(run.parent_run.unique_key)
         if run.user:
-            run.user = self._get_user(run.user.unique_key)
+            run.user = self.get_user(run.user.unique_key)
         return run
 
-    def _get_operation(self, operation_key: tuple) -> OperationDTO:
+    def get_operation(self, operation_key: tuple) -> OperationDTO:
         operation = self._operations[operation_key]
-        operation.run = self._get_run(operation.run.unique_key)
+        operation.run = self.get_run(operation.run.unique_key)
         return operation
 
-    def _get_input(self, input_key: tuple) -> InputDTO:
+    def get_input(self, input_key: tuple) -> InputDTO:
         input_ = self._inputs[input_key]
-        input_.operation = self._get_operation(input_.operation.unique_key)
-        input_.dataset = self._get_dataset(input_.dataset.unique_key)
+        input_.operation = self.get_operation(input_.operation.unique_key)
+        input_.dataset = self.get_dataset(input_.dataset.unique_key)
         if input_.schema:
-            input_.schema = self._get_schema(input_.schema.unique_key)
+            input_.schema = self.get_schema(input_.schema.unique_key)
         return input_
 
-    def _get_output(self, output_key: tuple) -> OutputDTO:
+    def get_output(self, output_key: tuple) -> OutputDTO:
         output = self._outputs[output_key]
-        output.operation = self._get_operation(output.operation.unique_key)
-        output.dataset = self._get_dataset(output.dataset.unique_key)
+        output.operation = self.get_operation(output.operation.unique_key)
+        output.dataset = self.get_dataset(output.dataset.unique_key)
         if output.schema:
-            output.schema = self._get_schema(output.schema.unique_key)
+            output.schema = self.get_schema(output.schema.unique_key)
         return output
 
-    def _get_column_lineage(self, output_key: tuple) -> ColumnLineageDTO:
+    def get_column_lineage(self, output_key: tuple) -> ColumnLineageDTO:
         lineage = self._column_lineage[output_key]
-        lineage.operation = self._get_operation(lineage.operation.unique_key)
-        lineage.source_dataset = self._get_dataset(lineage.source_dataset.unique_key)
-        lineage.target_dataset = self._get_dataset(lineage.target_dataset.unique_key)
+        lineage.operation = self.get_operation(lineage.operation.unique_key)
+        lineage.source_dataset = self.get_dataset(lineage.source_dataset.unique_key)
+        lineage.target_dataset = self.get_dataset(lineage.target_dataset.unique_key)
         return lineage
 
     def locations(self) -> list[LocationDTO]:
-        return list(map(self._get_location, self._locations))
+        return list(map(self.get_location, self._locations))
 
     def datasets(self) -> list[DatasetDTO]:
-        return list(map(self._get_dataset, self._datasets))
+        return list(map(self.get_dataset, self._datasets))
 
     def dataset_symlinks(self) -> list[DatasetSymlinkDTO]:
-        return list(map(self._get_dataset_symlink, self._dataset_symlinks))
+        return list(map(self.get_dataset_symlink, self._dataset_symlinks))
 
     def jobs(self) -> list[JobDTO]:
-        return list(map(self._get_job, self._jobs))
+        return list(map(self.get_job, self._jobs))
 
     def runs(self) -> list[RunDTO]:
-        return list(map(self._get_run, self._runs))
+        return list(map(self.get_run, self._runs))
 
     def operations(self) -> list[OperationDTO]:
-        return list(map(self._get_operation, self._operations))
+        return list(map(self.get_operation, self._operations))
 
     def inputs(self) -> list[InputDTO]:
-        return list(map(self._get_input, self._inputs))
+        return list(map(self.get_input, self._inputs))
 
     def outputs(self) -> list[OutputDTO]:
-        return list(map(self._get_output, self._outputs))
+        return list(map(self.get_output, self._outputs))
 
     def column_lineage(self) -> list[ColumnLineageDTO]:
-        return list(map(self._get_column_lineage, self._column_lineage))
+        return list(map(self.get_column_lineage, self._column_lineage))
 
     def schemas(self) -> list[SchemaDTO]:
-        return list(map(self._get_schema, self._schemas))
+        return list(map(self.get_schema, self._schemas))
 
     def users(self) -> list[UserDTO]:
-        return list(map(self._get_user, self._users))
+        return list(map(self.get_user, self._users))
 
 
 def extract_batch(events: list[OpenLineageRunEvent]) -> BatchExtractionResult:
     result = BatchExtractionResult()
+    dataset_cache: dict[tuple[str, str], DatasetDTO] = {}
 
     for event in events:
         if event.job.facets.jobType and event.job.facets.jobType.jobType == OpenLineageJobType.JOB:
             operation = extract_operation(event)
             result.add_operation(operation)
+
             for input_dataset in event.inputs:
-                input_, symlinks = extract_input(operation, input_dataset)
-                result.add_input(input_)
-                for symlink in symlinks:
-                    result.add_dataset_symlink(symlink)
+                input_dto, symlink_dtos = extract_input(operation, input_dataset)
+
+                result.add_input(input_dto)
+                dataset_dto_cache_key = (input_dataset.namespace, input_dataset.name)
+                dataset_cache[dataset_dto_cache_key] = result.get_dataset(input_dto.dataset.unique_key)
+
+                for symlink_dto in symlink_dtos:
+                    result.add_dataset_symlink(symlink_dto)
 
             for output_dataset in event.outputs:
-                output, symlinks = extract_output(operation, output_dataset)
-                result.add_output(output)
-                for symlink in symlinks:
-                    result.add_dataset_symlink(symlink)
+                output_dto, symlink_dtos = extract_output(operation, output_dataset)
+
+                result.add_output(output_dto)
+                dataset_dto_cache_key = (output_dataset.namespace, output_dataset.name)
+                dataset_cache[dataset_dto_cache_key] = result.get_dataset(output_dto.dataset.unique_key)
+
+                for symlink_dto in symlink_dtos:
+                    result.add_dataset_symlink(symlink_dto)
 
             for dataset in event.inputs + event.outputs:
-                column_lineage = extract_column_lineage(operation, dataset)
+                column_lineage = extract_column_lineage(operation, dataset, dataset_cache)
                 for item in column_lineage:
                     result.add_column_lineage(item)
 
diff --git a/data_rentgen/consumer/extractors/column_lineage.py b/data_rentgen/consumer/extractors/column_lineage.py
@@ -4,16 +4,18 @@
 import logging
 from collections import defaultdict
 
-from data_rentgen.consumer.extractors.dataset import extract_dataset
+from data_rentgen.consumer.extractors.dataset import extract_dataset_ref
 from data_rentgen.consumer.openlineage.dataset import OpenLineageDataset
 from data_rentgen.consumer.openlineage.dataset_facets.column_lineage import (
+    OpenLineageColumnLineageDatasetFacetFieldRef,
     OpenLineageColumnLineageDatasetFacetFieldTransformation,
 )
 from data_rentgen.dto import (
     ColumnLineageDTO,
     DatasetColumnRelationDTO,
     DatasetColumnRelationTypeDTO,
 )
+from data_rentgen.dto.dataset import DatasetDTO
 from data_rentgen.dto.operation import OperationDTO
 
 logger = logging.getLogger(__name__)
@@ -49,11 +51,29 @@ def extract_dataset_column_relation_type(
     return result or DatasetColumnRelationTypeDTO.UNKNOWN
 
 
-def extract_column_lineage(operation: OperationDTO, target_dataset: OpenLineageDataset) -> list[ColumnLineageDTO]:
-    target_dataset_dto = extract_dataset(target_dataset)
+def resolve_dataset_ref(
+    dataset_ref: OpenLineageDataset | OpenLineageColumnLineageDatasetFacetFieldRef,
+    dataset_dto_cache: dict[tuple[str, str], DatasetDTO],
+) -> DatasetDTO:
+    # Extracting a dataset for every column is expensive, so DTOs are memoized by raw (namespace, name).
+    dataset_cache_key = (dataset_ref.namespace, dataset_ref.name)
+    if dataset_cache_key not in dataset_dto_cache:  # miss: build the DTO once, store it in the caller's dict
+        # https://github.com/OpenLineage/OpenLineage/issues/2938#issuecomment-2320377260
+        dataset_dto_cache[dataset_cache_key] = extract_dataset_ref(dataset_ref)
+    return dataset_dto_cache[dataset_cache_key]
+
+
+def extract_column_lineage(
+    operation: OperationDTO,
+    target_dataset: OpenLineageDataset,
+    dataset_cache: dict[tuple[str, str], DatasetDTO] | None = None,
+) -> list[ColumnLineageDTO]:
     if not target_dataset.facets.columnLineage:
         return []
 
+    dataset_cache = {} if dataset_cache is None else dataset_cache  # keep the caller's dict even when empty
+    target_dataset_dto = resolve_dataset_ref(target_dataset, dataset_cache)
+
     # Grouping column lineage by source+target dataset. This is unique combination within operation,
     # so we can use it to generate the same fingerprint for all dataset column relations
     datasets = {target_dataset_dto.unique_key: target_dataset_dto}
@@ -62,7 +82,7 @@ def extract_column_lineage(operation: OperationDTO, target_dataset: OpenLineageD
     # direct lineage (source_column -> target_column)
     for field, raw_column_lineage in target_dataset.facets.columnLineage.fields.items():
         for input_field in raw_column_lineage.inputFields:
-            source_dataset_dto = extract_dataset(input_field)
+            source_dataset_dto = resolve_dataset_ref(input_field, dataset_cache)
             datasets[source_dataset_dto.unique_key] = source_dataset_dto
 
             column_lineage_key = (source_dataset_dto.unique_key, target_dataset_dto.unique_key)
@@ -84,7 +104,7 @@ def extract_column_lineage(operation: OperationDTO, target_dataset: OpenLineageD
     # indirect lineage (source_column -> target_dataset),
     # added to OL since v1.23 and send only when columnLineage.datasetLineageEnabled=true
     for input_field in target_dataset.facets.columnLineage.dataset:
-        source_dataset_dto = extract_dataset(input_field)
+        source_dataset_dto = resolve_dataset_ref(input_field, dataset_cache)
         datasets[source_dataset_dto.unique_key] = source_dataset_dto
 
         column_lineage_key = (source_dataset_dto.unique_key, target_dataset_dto.unique_key)
diff --git a/data_rentgen/consumer/extractors/dataset.py b/data_rentgen/consumer/extractors/dataset.py
@@ -58,16 +58,28 @@ def connect_dataset_with_symlinks(
     return sorted(result, key=lambda x: x.type)
 
 
-def extract_dataset(dataset: OpenLineageDatasetLike) -> DatasetDTO:
-    name_with_partitions = PARTITION_PATH_PATTERN.match(dataset.name)
-    name = name_with_partitions.group(1) if name_with_partitions else dataset.name
+def strip_partitions_from_path(name: str) -> str:
+    # convert /some/long/path/with=partition/another=abc to /some/long/path
+    if "=" not in name or "/" not in name:  # fast path: skip pattern matching when either character is absent
+        return name
+
+    name_with_partitions = PARTITION_PATH_PATTERN.match(name)
+    return name_with_partitions.group(1) if name_with_partitions else name
+
+
+def extract_dataset_ref(dataset: OpenLineageDatasetLike) -> DatasetDTO:
     return DatasetDTO(
-        name=name,
+        name=strip_partitions_from_path(dataset.name),
         location=extract_dataset_location(dataset),
-        format=extract_dataset_format(dataset),
     )
 
 
+def extract_dataset(dataset: OpenLineageDataset) -> DatasetDTO:
+    dataset_dto = extract_dataset_ref(dataset)  # name + location only
+    dataset_dto.format = extract_dataset_format(dataset)  # format is derived from the dataset's storage facet
+    return dataset_dto
+
+
 def extract_dataset_and_symlinks(dataset: OpenLineageDataset) -> tuple[DatasetDTO, list[DatasetSymlinkDTO]]:
     dataset_dto = extract_dataset(dataset)
     if not dataset.facets.symlinks:
@@ -91,7 +103,7 @@ def extract_dataset_and_symlinks(dataset: OpenLineageDataset) -> tuple[DatasetDT
                 "Only the first one will be used for replacement. Symlink name: %s",
                 table_symlinks[0].name,
             )
-        table_dataset_dto = extract_dataset(table_symlinks[0])
+        table_dataset_dto = extract_dataset_ref(table_symlinks[0])
         return (
             table_dataset_dto,
             connect_dataset_with_symlinks(
@@ -103,7 +115,7 @@ def extract_dataset_and_symlinks(dataset: OpenLineageDataset) -> tuple[DatasetDT
 
     symlinks = []
     for symlink_identifier in dataset.facets.symlinks.identifiers:
-        symlink_dto = extract_dataset(symlink_identifier)
+        symlink_dto = extract_dataset_ref(symlink_identifier)
         symlinks.extend(
             connect_dataset_with_symlinks(
                 dataset_dto,
@@ -134,10 +146,7 @@ def extract_dataset_location(dataset: OpenLineageDatasetLike) -> LocationDTO:
     )
 
 
-def extract_dataset_format(dataset: OpenLineageDatasetLike) -> str | None:
-    if isinstance(dataset, (OpenLineageSymlinkIdentifier, OpenLineageColumnLineageDatasetFacetFieldRef)):
-        return None
-
+def extract_dataset_format(dataset: OpenLineageDataset) -> str | None:
     match dataset.facets.storage:
         case OpenLineageStorageDatasetFacet(storageLayer="default", fileFormat=file_format):
             # See https://github.com/OpenLineage/OpenLineage/issues/2770
diff --git a/data_rentgen/consumer/extractors/job.py b/data_rentgen/consumer/extractors/job.py
@@ -8,7 +8,14 @@
 from data_rentgen.dto import JobDTO, JobTypeDTO, LocationDTO
 
 
-def extract_job(job: OpenLineageJob | OpenLineageParentJob) -> JobDTO:
+def extract_parent_job(job: OpenLineageParentJob) -> JobDTO:
+    return JobDTO(  # parent job reference: only name and location are populated here
+        name=job.name,
+        location=extract_job_location(job),
+    )
+
+
+def extract_job(job: OpenLineageJob) -> JobDTO:
     return JobDTO(
         name=job.name,
         location=extract_job_location(job),
@@ -27,8 +34,8 @@ def extract_job_location(job: OpenLineageJob | OpenLineageParentJob) -> Location
     )
 
 
-def extract_job_type(job: OpenLineageJob | OpenLineageParentJob) -> JobTypeDTO | None:
-    if isinstance(job, OpenLineageJob) and job.facets.jobType:
+def extract_job_type(job: OpenLineageJob) -> JobTypeDTO | None:
+    if job.facets.jobType:
         job_type = job.facets.jobType.jobType
         integration_type = job.facets.jobType.integration
         return JobTypeDTO(f"{integration_type}_{job_type}")
diff --git a/data_rentgen/consumer/extractors/operation.py b/data_rentgen/consumer/extractors/operation.py
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: 2024-2025 MTS PJSC
 # SPDX-License-Identifier: Apache-2.0
 
-from data_rentgen.consumer.extractors.run import extract_run_minimal
+from data_rentgen.consumer.extractors.run import extract_parent_run
 from data_rentgen.consumer.openlineage.run_event import (
     OpenLineageRunEvent,
     OpenLineageRunEventType,
@@ -11,7 +11,7 @@
 
 def extract_operation(event: OpenLineageRunEvent) -> OperationDTO:
     # operation always has parent
-    run = extract_run_minimal(event.run.facets.parent)  # type: ignore[arg-type]
+    run = extract_parent_run(event.run.facets.parent)  # type: ignore[arg-type]
 
     # in some cases, operation name may contain raw SELECT query with newlines
     operation_name = " ".join(line.strip() for line in event.job.name.splitlines()).strip()
diff --git a/data_rentgen/consumer/extractors/run.py b/data_rentgen/consumer/extractors/run.py
diff --git a/tests/test_consumer/test_handlers/test_runs_handler_spark.py b/tests/test_consumer/test_handlers/test_runs_handler_spark.py