Skip to content

Commit 03e2acf

Browse files
committed
add prefix option to DataTransformMDB class
1 parent 1f2267c commit 03e2acf

File tree

1 file changed

+34
-6
lines changed

1 file changed

+34
-6
lines changed

biothings/hub/datatransform/datatransform_mdb.py

Lines changed: 34 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,11 @@ def __init__(self, graph, *args, **kwargs):
188188
source document regardless of whether it matches an
189189
edge or not. (advanced usage)
190190
:type copy_from_doc: bool
191+
192+
Note: Prefixes can be defined at the node level using:
193+
graph.add_node("chebi", prefix="CHEBI")
194+
When an identifier is converted to a node with a prefix attribute,
195+
the prefix will be automatically added to the _id.
191196
"""
192197
if not isinstance(graph, nx.DiGraph):
193198
raise ValueError("key_lookup configuration error: graph must be of type nx.DiGraph")
@@ -198,6 +203,29 @@ def __init__(self, graph, *args, **kwargs):
198203
super(DataTransformMDB, self).__init__(*args, **kwargs)
199204
self._precompute_paths()
200205

206+
def _apply_prefix(self, identifier, output_type):
207+
"""
208+
Apply prefix to identifier based on output type.
209+
210+
Prefixes are defined as node attributes in the graph:
211+
graph.add_node("chebi", prefix="CHEBI")
212+
213+
:param identifier: The identifier value to potentially prefix
214+
:param output_type: The output type to check for prefix
215+
:return: The identifier with prefix applied if configured
216+
"""
217+
# Check if the node has a prefix attribute
218+
if output_type in self.graph.nodes():
219+
node_data = self.graph.nodes[output_type]
220+
if 'prefix' in node_data:
221+
prefix = node_data['prefix']
222+
identifier_str = str(identifier)
223+
# Only add prefix if it's not already there
224+
if not identifier_str.startswith(prefix + ":"):
225+
return f"{prefix}:{identifier_str}"
226+
227+
return str(identifier)
228+
201229
def _valid_input_type(self, input_type):
202230
return input_type.lower() in self.graph.nodes()
203231

@@ -292,7 +320,7 @@ def key_lookup_batch(self, batchiter):
292320
(hit_lst, miss_lst) = self.travel(input_type, output_type, miss_lst)
293321
# or if copy is allowed, we get the value from the doc
294322
elif self.copy_from_doc:
295-
(hit_lst, miss_lst) = self._copy(input_type, miss_lst)
323+
(hit_lst, miss_lst) = self._copy(input_type, output_type, miss_lst)
296324
else:
297325
(hit_lst, miss_lst) = self.travel(input_type, output_type, miss_lst)
298326

@@ -305,15 +333,15 @@ def key_lookup_batch(self, batchiter):
305333
for doc in miss_lst:
306334
yield doc
307335

308-
def _copy(self, input_type, doc_lst):
336+
def _copy(self, input_type, output_type, doc_lst):
309337
"""Copy ids in the case where input_type == output_type"""
310338
hit_lst = []
311339
miss_lst = []
312340
for doc in doc_lst:
313341
val = nested_lookup(doc, input_type[1])
314342
if val:
315-
# ensure _id is always a str
316-
doc["_id"] = str(val)
343+
# ensure _id is always a str and apply prefix if configured
344+
doc["_id"] = self._apply_prefix(val, output_type)
317345
hit_lst.append(doc)
318346
# retain debug information if available (assumed dt_debug already in place)
319347
if self.debug:
@@ -371,8 +399,8 @@ def _build_hit_miss_lsts(doc_lst, id_strct, debug):
371399
value = nested_lookup(doc, input_type[1])
372400
for lookup_id in id_strct.find_left(value):
373401
new_doc = copy.deepcopy(doc)
374-
# ensure _id is always a str
375-
new_doc["_id"] = str(lookup_id)
402+
# ensure _id is always a str and apply prefix if configured
403+
new_doc["_id"] = self._apply_prefix(lookup_id, target)
376404
# capture debug information
377405
if debug:
378406
new_doc["dt_debug"]["start_field"] = input_type[1]

0 commit comments

Comments
 (0)