Skip to content

Commit f23a034

Browse files
committed
fix: added new mappings; improved new sample processing; minor fixes
1 parent 6562e30 commit f23a034

File tree

1 file changed

+78
-30
lines changed

1 file changed

+78
-30
lines changed

rd3/novelomics_shipment_processing.py

Lines changed: 78 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
"""Solve-RD Novelomics: new shipment file processing
2-
FILE: solverd_novelomics_processing.py
2+
FILE: novelomics_shipment_processing.py
33
AUTHOR: David Ruvolo
44
CREATED: 2022-11-15
5-
MODIFIED: 2024-03-04
6-
PURPOSE: Import new novelomics data
5+
MODIFIED: 2024-03-14
6+
PURPOSE: Process new shipment manifest files - register new subjects and samples
77
STATUS: stable
88
PACKAGES: **see below**
99
COMMENTS: NA
@@ -14,7 +14,7 @@
1414
import operator
1515
import re
1616
from dotenv import load_dotenv
17-
from datatable import dt, f, as_type
17+
from datatable import dt, f
1818
from tqdm import tqdm
1919

2020
from rd3tools.molgenis import Molgenis
@@ -70,6 +70,13 @@ def get_wrapped_values(val: str = None):
7070
shipment_dt = dt.Frame(shipment_raw)
7171
del shipment_dt['_href']
7272

73+
# if you need to delete unprocessed records due to data errors, then
74+
# run the following commands
75+
# rd3_prod.delete_list(
76+
# entity='rd3_portal_novelomics_shipment',
77+
# entities=shipment_dt['molgenis_id'].to_list()[0]
78+
# )
79+
7380
# ///////////////////////////////////////////////////////////////////////////////
7481

7582
# ~ 1 ~
@@ -230,24 +237,31 @@ def get_wrapped_values(val: str = None):
230237
)
231238

232239
# check incoming data, update mappings (if applicable), and rerun
233-
new_tissue_types = dt.unique(
240+
incoming_tissue_types = dt.unique(
234241
shipment_dt[f.tissue_type != None, 'tissue_type']).to_list()[0]
235242

236-
new_tissue_types.sort(key=str.lower)
237-
for value in new_tissue_types:
243+
incoming_tissue_types.sort(key=str.lower)
244+
for value in incoming_tissue_types:
238245
if value.lower() not in tissue_type_mappings:
239246
print(f"Value '{value}' not in tissue type mappings")
240247

248+
# update mappings for cases that are simple recodes
241249
tissue_type_mappings.update({
250+
'adipose tissue': 'Adipose',
242251
'blood': 'Whole Blood',
243252
'cell pellet': 'Cells',
253+
'exelid': 'Eyelid',
254+
'fat skin': 'Adipose - Subcutaneous',
244255
'fibroblasts': 'Cells - Cultured fibroblasts',
256+
"fetus skin": "Foetus",
245257
'fetus': 'Foetus',
246258
'ffpe': 'Tumor',
247259
'heart': 'Heart',
248260
'muscle': 'Muscle - Skeletal',
249261
'pbmc': 'Peripheral Blood Mononuclear Cells',
250-
'whole blood': 'Whole Blood'
262+
'whole blood': 'Whole Blood',
263+
'subcutaneous fat': 'Adipose - Subcutaneous',
264+
'tissue': 'Tissue - unspecified',
251265
})
252266

253267
# ///////////////////////////////////////
@@ -256,11 +270,21 @@ def get_wrapped_values(val: str = None):
256270
# Create anatomical location mappings
257271

258272
if 'anatomical_location' in shipment_dt.names:
259-
print('Checking anatomical location mappings....')
273+
print('Manually check anatomical location mappings!')
260274

261275
# As of 06 Dec 2022, the value 'blood' can be ignored as it cannot be mapped
262-
# to a more specific term
276+
# to a more specific term.
277+
# As of 15 March 2024, terms that do not exist in RD3 will be labelled other.
278+
# The original value will be placed in a new column. This was implemented as
279+
# it isn't possible to determine the exact location from the supplied value.
263280
anatomical_location_mappings = {
281+
'chest': '74964007', # Other
282+
'left': '74964007', # Other
283+
'nose': '74964007', # Other
284+
'retro right auricular area': '74964007', # Other
285+
'right': '74964007', # Other
286+
'scalp': '74964007', # Other
287+
264288
'chest skin': '74160004', # Skin of Chest
265289
'skin scalp': '43067004', # Skin of Scalp
266290
# Entire skin of postauricular region
@@ -295,8 +319,7 @@ def get_wrapped_values(val: str = None):
295319
)['id']
296320

297321
material_types['mappingID'] = dt.Frame([
298-
value.lower()
299-
for value in material_types['id'].to_list()[0]
322+
value.lower() for value in material_types['id'].to_list()[0]
300323
])
301324

302325
material_type_mappings = as_key_pairs(
@@ -350,7 +373,10 @@ def get_wrapped_values(val: str = None):
350373
f"Value '{value}' does not exist in pathological state mappings")
351374

352375
# if there are any values, enter them below ->
353-
# pathological_state_mappings.update({ ... })
376+
pathological_state_mappings.update({
377+
'affected area': 'Affected',
378+
'safe area': 'Normal'
379+
})
354380

355381
# ///////////////////////////////////////////////////////////////////////////////
356382

@@ -396,7 +422,7 @@ def get_wrapped_values(val: str = None):
396422

397423
# recode anatomical location (if available)
398424
if 'anatomical_location' in shipment_dt.names:
399-
shipment_dt['anatomical_location'] = dt.Frame([
425+
shipment_dt['tmp_anatomical_location'] = dt.Frame([
400426
recode_value(
401427
mappings=anatomical_location_mappings,
402428
value=value.lower(),
@@ -405,6 +431,15 @@ def get_wrapped_values(val: str = None):
405431
for value in shipment_dt['anatomical_location'].to_list()[0]
406432
])
407433

434+
# identify cases recoded to "Other" (74964007) and keep the original value as a comment
435+
shipment_dt['anatomical_location_comment'] = dt.Frame([
436+
row[1] if row[0] == "74964007" else None
437+
for row in shipment_dt[:, ['tmp_anatomical_location', 'anatomical_location']].to_tuples()
438+
])
439+
440+
shipment_dt['anatomical_location'] = shipment_dt['tmp_anatomical_location']
441+
del shipment_dt['tmp_anatomical_location']
442+
408443
# recode sample types (i.e., materialType)
409444
shipment_dt['sample_type'] = dt.Frame([
410445
'TISSUE (FFPE)'
@@ -531,7 +566,9 @@ def get_wrapped_values(val: str = None):
531566
'tissue_type': 'tissueType',
532567
'sample_type': 'materialType',
533568
'pathological_state': 'pathological_state',
534-
'tumor_cell_fraction': 'percentageTumorCells'
569+
'tumor_cell_fraction': 'percentageTumorCells',
570+
'anatomical_location': 'anatomicalLocation',
571+
'anatomical_location_comment': 'anatomicalLocationComment',
535572
}
536573

537574
# ///////////////////////////////////////////////////////////////////////////////
@@ -606,15 +643,18 @@ def get_wrapped_values(val: str = None):
606643
curr_sample = dt_as_recordset(samples_dt[f.sampleID == sample_id, :])[0]
607644

608645
# identify records that require manual verification
609-
# for column in columns_with_major_conflicts:
610-
# if (column in incomingSample) and (column in existingSample):
611-
# if incomingSample[column] != existingSample[column]:
612-
# print(f"Incoming sample {id} has conflicting {column} values")
613-
# samples_with_conflicts.append({
614-
# 'incomingValue': incomingSample[column],
615-
# 'existingValue': existingSample[column],
616-
# 'message': f"values in {column} do not match"
617-
# })
646+
for column in columns_with_major_conflicts:
647+
if (column in new_sample) and (column in curr_sample):
648+
if new_sample[column] is not None and curr_sample[column] is not None:
649+
if new_sample[column] not in curr_sample[column]:
650+
new_row = {
651+
'sampleID': sample_id,
652+
'subjectID': new_sample['subjectID'],
653+
}
654+
curr_values = curr_sample[column].split(',')
655+
curr_values.append(new_sample[column])
656+
new_row[column] = ','.join(list(set(curr_values)))
657+
samples_with_conflicts.append(new_row)
618658

619659
# identify columns that can be automatically imported
620660
for column in columns_with_minor_conflicts:
@@ -661,14 +701,18 @@ def get_wrapped_values(val: str = None):
661701
# ~ 3a ~
662702
# Import new subject metadata
663703
new_subjects_dt = shipment_dt[
664-
f.isNewSubject, (
704+
f.isNewSubject,
705+
(
665706
f.subjectID,
666707
f.organisation,
667708
f.ERN,
668709
f.partOfRelease,
669710
f.dateRecordCreated,
670-
f.recordCreatedBy)]
711+
f.recordCreatedBy
712+
)
713+
]
671714

715+
# if there are no new subjects to import, then you can skip to step 3b
672716
if not new_subjects_dt.nrows:
673717
print('No subjects to import. You may skip this step')
674718

@@ -687,7 +731,7 @@ def get_wrapped_values(val: str = None):
687731

688732
# ///////////////////////////////////////
689733

690-
# ~ 2b ~
734+
# ~ 3b ~
691735
# Import new sample metadata
692736
new_samples_dt = shipment_dt[
693737
f.isNewSample, (
@@ -698,19 +742,23 @@ def get_wrapped_values(val: str = None):
698742
f.materialType,
699743
f.pathological_state,
700744
f.percentageTumorCells,
745+
f.anatomicalLocation,
746+
f.anatomicalLocationComment,
701747
f.partOfRelease,
702748
f.batch,
703749
f.organisation,
704750
f.ERN,
705751
f.dateRecordCreated,
706752
f.recordCreatedBy)]
707753

754+
new_samples_dt['retracted'] = 'N'
708755
new_samples_dt.names = {'subjectID': 'belongsToSubject'}
756+
709757
rd3_prod.import_dt('solverd_samples', new_samples_dt)
710758

711759
# ///////////////////////////////////////
712760

713-
# ~ 2b ~
761+
# ~ 3c ~
714762
# Update subject release information
715763

716764
existing_subjects_dt = shipment_dt[f.isNewSubject == False, :]
@@ -743,7 +791,7 @@ def get_wrapped_values(val: str = None):
743791

744792
# ///////////////////////////////////////
745793

746-
# ~ 2c ~
794+
# ~ 3d ~
747795
# Update release and batch info in the samples table
748796

749797
if bool(samples_with_updates):
@@ -789,7 +837,7 @@ def get_wrapped_values(val: str = None):
789837

790838
# ///////////////////////////////////////
791839

792-
# ~ 2d ~
840+
# ~ 3e ~
793841
# Update processed status in the portal table
794842

795843
processed_ids = []

0 commit comments

Comments
 (0)