@@ -188,6 +188,11 @@ def __init__(self, graph, *args, **kwargs):
188188 source document regardless of whether it matches an
189189 edge or not. (advanced usage)
190190 :type copy_from_doc: bool
191+
192+ Note: Prefixes can be defined at the node level using:
193+ graph.add_node("chebi", prefix="CHEBI")
194+ When an identifier is converted to a node with a prefix attribute,
195+ the prefix will be automatically added to the _id.
191196 """
192197 if not isinstance (graph , nx .DiGraph ):
193198 raise ValueError ("key_lookup configuration error: graph must be of type nx.DiGraph" )
@@ -198,6 +203,29 @@ def __init__(self, graph, *args, **kwargs):
198203 super (DataTransformMDB , self ).__init__ (* args , ** kwargs )
199204 self ._precompute_paths ()
200205
def _apply_prefix(self, identifier, output_type):
    """
    Return ``identifier`` as a string, prefixed for ``output_type`` if configured.

    Prefixes are defined as node attributes in the graph:
        graph.add_node("chebi", prefix="CHEBI")

    :param identifier: The identifier value to potentially prefix
    :param output_type: Graph node name whose ``prefix`` attribute is consulted
    :return: ``"<prefix>:<identifier>"`` when the node defines a non-empty
        prefix and the identifier does not already start with
        ``"<prefix>:"``; otherwise ``str(identifier)`` unchanged.
    :rtype: str
    """
    # The result is always a string, whether or not a prefix applies.
    identifier_str = str(identifier)

    nodes = self.graph.nodes
    if output_type not in nodes():
        return identifier_str

    # A missing, None or empty prefix means "nothing to add"; this avoids
    # producing ":<id>" or failing on ``None + ":"``.
    prefix = nodes[output_type].get("prefix")
    if not prefix:
        return identifier_str

    # Build the marker once so the "already prefixed" check and the value
    # actually emitted can never drift apart (keeps the call idempotent).
    marker = f"{prefix}:"
    if identifier_str.startswith(marker):
        return identifier_str
    return marker + identifier_str
228+
def _valid_input_type(self, input_type):
    """
    Check whether ``input_type`` names a node in the lookup graph.

    The membership test is done on the lower-cased name.  A non-string
    value cannot name a node, so it is reported as invalid rather than
    raising ``AttributeError`` from ``.lower()``.

    :param input_type: candidate input type name
    :return: True if the lower-cased name is a node of ``self.graph``
    :rtype: bool
    """
    if not isinstance(input_type, str):
        return False
    return input_type.lower() in self.graph.nodes()
203231
@@ -292,7 +320,7 @@ def key_lookup_batch(self, batchiter):
292320 (hit_lst , miss_lst ) = self .travel (input_type , output_type , miss_lst )
293321 # or if copy is allowed, we get the value from the doc
294322 elif self .copy_from_doc :
295- (hit_lst , miss_lst ) = self ._copy (input_type , miss_lst )
323+ (hit_lst , miss_lst ) = self ._copy (input_type , output_type , miss_lst )
296324 else :
297325 (hit_lst , miss_lst ) = self .travel (input_type , output_type , miss_lst )
298326
@@ -305,15 +333,15 @@ def key_lookup_batch(self, batchiter):
305333 for doc in miss_lst :
306334 yield doc
307335
308- def _copy (self , input_type , doc_lst ):
336+ def _copy (self , input_type , output_type , doc_lst ):
309337 """Copy ids in the case where input_type == output_type"""
310338 hit_lst = []
311339 miss_lst = []
312340 for doc in doc_lst :
313341 val = nested_lookup (doc , input_type [1 ])
314342 if val :
315- # ensure _id is always a str
316- doc ["_id" ] = str (val )
343+ # ensure _id is always a str and apply prefix if configured
344+ doc ["_id" ] = self . _apply_prefix (val , output_type )
317345 hit_lst .append (doc )
318346 # retain debug information if available (assumed dt_debug already in place)
319347 if self .debug :
@@ -371,8 +399,8 @@ def _build_hit_miss_lsts(doc_lst, id_strct, debug):
371399 value = nested_lookup (doc , input_type [1 ])
372400 for lookup_id in id_strct .find_left (value ):
373401 new_doc = copy .deepcopy (doc )
374- # ensure _id is always a str
375- new_doc ["_id" ] = str (lookup_id )
402+ # ensure _id is always a str and apply prefix if configured
403+ new_doc ["_id" ] = self . _apply_prefix (lookup_id , target )
376404 # capture debug information
377405 if debug :
378406 new_doc ["dt_debug" ]["start_field" ] = input_type [1 ]
0 commit comments