@@ -156,123 +156,133 @@ def add_schema(self, schema: SchemaDTO):
def add_user(self, user: UserDTO):
    """Register ``user`` in this result's user collection.

    Delegates to ``self._add`` with ``self._users`` as the target collection.
    """
    user_registry = self._users
    self._add(user_registry, user)
158158
def get_location(self, location_key: tuple) -> LocationDTO:
    """Return the entry stored under ``location_key`` in ``self._locations``.

    The lookup is a plain subscription, so a missing key propagates whatever
    error the underlying container raises.
    """
    known_locations = self._locations
    return known_locations[location_key]
161161
def get_schema(self, schema_key: tuple) -> SchemaDTO:
    """Return the entry stored under ``schema_key`` in ``self._schemas``.

    The lookup is a plain subscription, so a missing key propagates whatever
    error the underlying container raises.
    """
    known_schemas = self._schemas
    return known_schemas[schema_key]
164164
def get_user(self, user_key: tuple) -> UserDTO:
    """Return the entry stored under ``user_key`` in ``self._users``.

    The lookup is a plain subscription, so a missing key propagates whatever
    error the underlying container raises.
    """
    known_users = self._users
    return known_users[user_key]
167167
def get_dataset(self, dataset_key: tuple) -> DatasetDTO:
    """Return the dataset stored under ``dataset_key`` with its location resolved.

    The stored dataset is updated in place: its ``location`` attribute is
    replaced by the result of ``self.get_location`` for that location's
    ``unique_key``. The same stored object is returned.
    """
    found = self._datasets[dataset_key]
    location_key = found.location.unique_key
    found.location = self.get_location(location_key)
    return found
172172
def get_dataset_symlink(self, dataset_symlink_key: tuple) -> DatasetSymlinkDTO:
    """Return the symlink stored under ``dataset_symlink_key`` with both ends resolved.

    The stored symlink is updated in place: ``from_dataset`` and then
    ``to_dataset`` are each replaced by the result of ``self.get_dataset``
    for their ``unique_key``. The same stored object is returned.
    """
    link = self._dataset_symlinks[dataset_symlink_key]

    # Resolve the source end first, then the target end.
    source_key = link.from_dataset.unique_key
    link.from_dataset = self.get_dataset(source_key)

    target_key = link.to_dataset.unique_key
    link.to_dataset = self.get_dataset(target_key)

    return link
178178
def get_job(self, job_key: tuple) -> JobDTO:
    """Return the job stored under ``job_key`` with its location resolved.

    The stored job is updated in place: its ``location`` attribute is
    replaced by the result of ``self.get_location`` for that location's
    ``unique_key``. The same stored object is returned.
    """
    found = self._jobs[job_key]
    location_key = found.location.unique_key
    found.location = self.get_location(location_key)
    return found
183183
def get_run(self, run_key: tuple) -> RunDTO:
    """Return the run stored under ``run_key`` with its references resolved.

    The stored run is updated in place and returned:

    * ``job`` is always replaced via ``self.get_job``;
    * ``parent_run`` is replaced via a nested ``self.get_run`` call only
      when it is truthy;
    * ``user`` is replaced via ``self.get_user`` only when it is truthy.
    """
    record = self._runs[run_key]
    record.job = self.get_job(record.job.unique_key)

    parent = record.parent_run
    if parent:
        # Resolving the parent goes through get_run again, so the parent's
        # own references are resolved as well.
        record.parent_run = self.get_run(parent.unique_key)

    owner = record.user
    if owner:
        record.user = self.get_user(owner.unique_key)

    return record
192192
def get_operation(self, operation_key: tuple) -> OperationDTO:
    """Return the operation stored under ``operation_key`` with its run resolved.

    The stored operation is updated in place: its ``run`` attribute is
    replaced by the result of ``self.get_run`` for that run's ``unique_key``.
    The same stored object is returned.
    """
    found = self._operations[operation_key]
    run_key = found.run.unique_key
    found.run = self.get_run(run_key)
    return found
197197
def get_input(self, input_key: tuple) -> InputDTO:
    """Return the input stored under ``input_key`` with its references resolved.

    The stored input is updated in place and returned: ``operation`` and
    ``dataset`` are always replaced via ``self.get_operation`` and
    ``self.get_dataset``; ``schema`` is replaced via ``self.get_schema``
    only when it is truthy.
    """
    entry = self._inputs[input_key]

    operation_key = entry.operation.unique_key
    entry.operation = self.get_operation(operation_key)

    dataset_key = entry.dataset.unique_key
    entry.dataset = self.get_dataset(dataset_key)

    attached_schema = entry.schema
    if attached_schema:
        entry.schema = self.get_schema(attached_schema.unique_key)

    return entry
205205
def get_output(self, output_key: tuple) -> OutputDTO:
    """Return the output stored under ``output_key`` with its references resolved.

    The stored output is updated in place and returned: ``operation`` and
    ``dataset`` are always replaced via ``self.get_operation`` and
    ``self.get_dataset``; ``schema`` is replaced via ``self.get_schema``
    only when it is truthy.
    """
    entry = self._outputs[output_key]

    operation_key = entry.operation.unique_key
    entry.operation = self.get_operation(operation_key)

    dataset_key = entry.dataset.unique_key
    entry.dataset = self.get_dataset(dataset_key)

    attached_schema = entry.schema
    if attached_schema:
        entry.schema = self.get_schema(attached_schema.unique_key)

    return entry
213213
def get_column_lineage(self, output_key: tuple) -> ColumnLineageDTO:
    """Return the column lineage stored under ``output_key`` with references resolved.

    The stored lineage is updated in place and returned: ``operation`` is
    replaced via ``self.get_operation``, then ``source_dataset`` and
    ``target_dataset`` are each replaced via ``self.get_dataset``.
    """
    entry = self._column_lineage[output_key]

    operation_key = entry.operation.unique_key
    entry.operation = self.get_operation(operation_key)

    # Source is resolved before target.
    source_key = entry.source_dataset.unique_key
    entry.source_dataset = self.get_dataset(source_key)

    target_key = entry.target_dataset.unique_key
    entry.target_dataset = self.get_dataset(target_key)

    return entry
220220
def locations(self) -> list[LocationDTO]:
    """Return a list with ``self.get_location`` applied to every key of ``self._locations``.

    Results follow the iteration order of ``self._locations``.
    """
    return [self.get_location(key) for key in self._locations]
223223
def datasets(self) -> list[DatasetDTO]:
    """Return a list with ``self.get_dataset`` applied to every key of ``self._datasets``.

    Results follow the iteration order of ``self._datasets``.
    """
    return [self.get_dataset(key) for key in self._datasets]
226226
def dataset_symlinks(self) -> list[DatasetSymlinkDTO]:
    """Return a list with ``self.get_dataset_symlink`` applied to every key of ``self._dataset_symlinks``.

    Results follow the iteration order of ``self._dataset_symlinks``.
    """
    return [self.get_dataset_symlink(key) for key in self._dataset_symlinks]
229229
def jobs(self) -> list[JobDTO]:
    """Return a list with ``self.get_job`` applied to every key of ``self._jobs``.

    Results follow the iteration order of ``self._jobs``.
    """
    return [self.get_job(key) for key in self._jobs]
232232
def runs(self) -> list[RunDTO]:
    """Return a list with ``self.get_run`` applied to every key of ``self._runs``.

    Results follow the iteration order of ``self._runs``.
    """
    return [self.get_run(key) for key in self._runs]
235235
def operations(self) -> list[OperationDTO]:
    """Return a list with ``self.get_operation`` applied to every key of ``self._operations``.

    Results follow the iteration order of ``self._operations``.
    """
    return [self.get_operation(key) for key in self._operations]
238238
def inputs(self) -> list[InputDTO]:
    """Return a list with ``self.get_input`` applied to every key of ``self._inputs``.

    Results follow the iteration order of ``self._inputs``.
    """
    return [self.get_input(key) for key in self._inputs]
241241
def outputs(self) -> list[OutputDTO]:
    """Return a list with ``self.get_output`` applied to every key of ``self._outputs``.

    Results follow the iteration order of ``self._outputs``.
    """
    return [self.get_output(key) for key in self._outputs]
244244
def column_lineage(self) -> list[ColumnLineageDTO]:
    """Return a list with ``self.get_column_lineage`` applied to every key of ``self._column_lineage``.

    Results follow the iteration order of ``self._column_lineage``.
    """
    return [self.get_column_lineage(key) for key in self._column_lineage]
247247
def schemas(self) -> list[SchemaDTO]:
    """Return a list with ``self.get_schema`` applied to every key of ``self._schemas``.

    Results follow the iteration order of ``self._schemas``.
    """
    return [self.get_schema(key) for key in self._schemas]
250250
def users(self) -> list[UserDTO]:
    """Return a list with ``self.get_user`` applied to every key of ``self._users``.

    Results follow the iteration order of ``self._users``.
    """
    return [self.get_user(key) for key in self._users]
253253
254254
255255def extract_batch (events : list [OpenLineageRunEvent ]) -> BatchExtractionResult :
256256 result = BatchExtractionResult ()
257+ dataset_cache : dict [tuple [str , str ], DatasetDTO ] = {}
257258
258259 for event in events :
259260 if event .job .facets .jobType and event .job .facets .jobType .jobType == OpenLineageJobType .JOB :
260261 operation = extract_operation (event )
261262 result .add_operation (operation )
263+
262264 for input_dataset in event .inputs :
263- input_ , symlinks = extract_input (operation , input_dataset )
264- result .add_input (input_ )
265- for symlink in symlinks :
266- result .add_dataset_symlink (symlink )
265+ input_dto , symlink_dtos = extract_input (operation , input_dataset )
266+
267+ result .add_input (input_dto )
268+ dataset_dto_cache_key = (input_dataset .namespace , input_dataset .name )
269+ dataset_cache [dataset_dto_cache_key ] = result .get_dataset (input_dto .dataset .unique_key )
270+
271+ for symlink_dto in symlink_dtos :
272+ result .add_dataset_symlink (symlink_dto )
267273
268274 for output_dataset in event .outputs :
269- output , symlinks = extract_output (operation , output_dataset )
270- result .add_output (output )
271- for symlink in symlinks :
272- result .add_dataset_symlink (symlink )
275+ output_dto , symlink_dtos = extract_output (operation , output_dataset )
276+
277+ result .add_output (output_dto )
278+ dataset_dto_cache_key = (output_dataset .namespace , output_dataset .name )
279+ dataset_cache [dataset_dto_cache_key ] = result .get_dataset (output_dto .dataset .unique_key )
280+
281+ for symlink_dto in symlink_dtos :
282+ result .add_dataset_symlink (symlink_dto )
273283
274284 for dataset in event .inputs + event .outputs :
275- column_lineage = extract_column_lineage (operation , dataset )
285+ column_lineage = extract_column_lineage (operation , dataset , dataset_cache )
276286 for item in column_lineage :
277287 result .add_column_lineage (item )
278288
0 commit comments