11"""Solve-RD Novelomics: new shipment file processing
2- FILE: solverd_novelomics_processing .py
2+ FILE: novelomics_shipment_processing.py
33AUTHOR: David Ruvolo
44CREATED: 2022-11-15
5- MODIFIED: 2024-03-04
6- PURPOSE: Import new novelomics data
5+ MODIFIED: 2024-03-14
6+ PURPOSE: Process new shipment manifest files - register new subjects and samples
77STATUS: stable
88PACKAGES: **see below**
99COMMENTS: NA
1414import operator
1515import re
1616from dotenv import load_dotenv
17- from datatable import dt, f, as_type
17+ from datatable import dt, f
1818from tqdm import tqdm
1919
2020from rd3tools.molgenis import Molgenis
@@ -70,6 +70,13 @@ def get_wrapped_values(val: str = None):
7070shipment_dt = dt.Frame(shipment_raw)
7171del shipment_dt['_href']
7272
73+ # if you need to delete unprocessed records due to data errors, then
74+ # run the following commands
75+ # rd3_prod.delete_list(
76+ # entity='rd3_portal_novelomics_shipment',
77+ # entities=shipment_dt['molgenis_id'].to_list()[0]
78+ # )
79+
7380# ///////////////////////////////////////////////////////////////////////////////
7481
7582# ~ 1 ~
@@ -230,24 +237,31 @@ def get_wrapped_values(val: str = None):
230237)
231238
232239# check incoming data, update mappings (if applicable), and rerun
233- new_tissue_types = dt.unique(
240+ incoming_tissue_types = dt.unique(
234241 shipment_dt[f.tissue_type != None, 'tissue_type']).to_list()[0]
235242
236- new_tissue_types .sort(key=str.lower)
237- for value in new_tissue_types :
243+ incoming_tissue_types .sort(key=str.lower)
244+ for value in incoming_tissue_types :
238245 if value.lower() not in tissue_type_mappings:
239246 print(f"Value '{value}' not in tissue type mappings")
240247
248+ # update mappings for cases that are simple recodes
241249tissue_type_mappings.update({
250+ 'adipose tissue': 'Adipose',
242251 'blood': 'Whole Blood',
243252 'cell pellet': 'Cells',
253+ 'exelid': 'Eyelid',
254+ 'fat skin': 'Adipose - Subcutaneous',
244255 'fibroblasts': 'Cells - Cultured fibroblasts',
256+ "fetus skin": "Foetus",
245257 'fetus': 'Foetus',
246258 'ffpe': 'Tumor',
247259 'heart': 'Heart',
248260 'muscle': 'Muscle - Skeletal',
249261 'pbmc': 'Peripheral Blood Mononuclear Cells',
250- 'whole blood': 'Whole Blood'
262+ 'whole blood': 'Whole Blood',
263+ 'subcutaneous fat': 'Adipose - Subcutaneous',
264+ 'tissue': 'Tissue - unspecified',
251265})
252266
253267# ///////////////////////////////////////
@@ -256,11 +270,21 @@ def get_wrapped_values(val: str = None):
256270# Create anatomical location mappings
257271
258272if 'anatomical_location' in shipment_dt.names:
259- print('Checking anatomical location mappings.... ')
273+ print('Manually check anatomical location mappings! ')
260274
261275 # As of 06 Dec 2022, the value 'blood' can be ignored as it cannot be mapped
262- # to a more specific term
276+ # to a more specific term.
277+ # As of 15 March 2024, terms that do not exist in RD3 will be labelled other.
278+ # The original value will be placed in a new column. This was implemented as
279+ # it isn't possible to determine the exact location from the supplied value.
263280 anatomical_location_mappings = {
281+ 'chest': '74964007', # Other
282+ 'left': '74964007', # Other
283+ 'nose': '74964007', # Other
284+ 'retro right auricular area': '74964007', # Other
285+ 'right': '74964007', # Other
286+ 'scalp': '74964007', # Other
287+
264288 'chest skin': '74160004', # Skin of Chest
265289 'skin scalp': '43067004', # Skin of Scalp
266290 # Entire skin of postauricular region
@@ -295,8 +319,7 @@ def get_wrapped_values(val: str = None):
295319)['id']
296320
297321material_types['mappingID'] = dt.Frame([
298- value.lower()
299- for value in material_types['id'].to_list()[0]
322+ value.lower() for value in material_types['id'].to_list()[0]
300323])
301324
302325material_type_mappings = as_key_pairs(
@@ -350,7 +373,10 @@ def get_wrapped_values(val: str = None):
350373 f"Value '{value}' does not exist in pathological state mappings")
351374
352375 # if there are any values, enter them below ->
353- # pathological_state_mappings.update({ ... })
376+ pathological_state_mappings.update({
377+ 'affected area': 'Affected',
378+ 'safe area': 'Normal'
379+ })
354380
355381# ///////////////////////////////////////////////////////////////////////////////
356382
@@ -396,7 +422,7 @@ def get_wrapped_values(val: str = None):
396422
397423# recode anatomical location (if available)
398424if 'anatomical_location' in shipment_dt.names:
399- shipment_dt['anatomical_location '] = dt.Frame([
425+ shipment_dt['tmp_anatomical_location '] = dt.Frame([
400426 recode_value(
401427 mappings=anatomical_location_mappings,
402428 value=value.lower(),
@@ -405,6 +431,15 @@ def get_wrapped_values(val: str = None):
405431 for value in shipment_dt['anatomical_location'].to_list()[0]
406432 ])
407433
434+ # identify cases with "other"
435+ shipment_dt['anatomical_location_comment'] = dt.Frame([
436+ row[1] if row[0] == "74964007" else None
437+ for row in shipment_dt[:, ['tmp_anatomical_location', 'anatomical_location']].to_tuples()
438+ ])
439+
440+ shipment_dt['anatomical_location'] = shipment_dt['tmp_anatomical_location']
441+ del shipment_dt['tmp_anatomical_location']
442+
408443# recode sample types (i.e., materialType)
409444shipment_dt['sample_type'] = dt.Frame([
410445 'TISSUE (FFPE)'
@@ -531,7 +566,9 @@ def get_wrapped_values(val: str = None):
531566 'tissue_type': 'tissueType',
532567 'sample_type': 'materialType',
533568 'pathological_state': 'pathological_state',
534- 'tumor_cell_fraction': 'percentageTumorCells'
569+ 'tumor_cell_fraction': 'percentageTumorCells',
570+ 'anatomical_location': 'anatomicalLocation',
571+ 'anatomical_location_comment': 'anatomicalLocationComment',
535572}
536573
537574# ///////////////////////////////////////////////////////////////////////////////
@@ -606,15 +643,18 @@ def get_wrapped_values(val: str = None):
606643 curr_sample = dt_as_recordset(samples_dt[f.sampleID == sample_id, :])[0]
607644
608645 # identify records that require manual verification
609- # for column in columns_with_major_conflicts:
610- # if (column in incomingSample) and (column in existingSample):
611- # if incomingSample[column] != existingSample[column]:
612- # print(f"Incoming sample {id} has conflicting {column} values")
613- # samples_with_conflicts.append({
614- # 'incomingValue': incomingSample[column],
615- # 'existingValue': existingSample[column],
616- # 'message': f"values in {column} do not match"
617- # })
646+ for column in columns_with_major_conflicts:
647+ if (column in new_sample) and (column in curr_sample):
648+ if new_sample[column] is not None and curr_sample[column] is not None:
649+ if new_sample[column] not in curr_sample[column]:
650+ new_row = {
651+ 'sampleID': sample_id,
652+ 'subjectID': new_sample['subjectID'],
653+ }
654+ curr_values = curr_sample[column].split(',')
655+ curr_values.append(new_sample[column])
656+ new_row[column] = ','.join(list(set(curr_values)))
657+ samples_with_conflicts.append(new_row)
618658
619659 # identify columns that can be automatically imported
620660 for column in columns_with_minor_conflicts:
@@ -661,14 +701,18 @@ def get_wrapped_values(val: str = None):
661701# ~ 3a ~
662702# Import new subject metadata
663703new_subjects_dt = shipment_dt[
664- f.isNewSubject, (
704+ f.isNewSubject,
705+ (
665706 f.subjectID,
666707 f.organisation,
667708 f.ERN,
668709 f.partOfRelease,
669710 f.dateRecordCreated,
670- f.recordCreatedBy)]
711+ f.recordCreatedBy
712+ )
713+ ]
671714
715+ # if there are no more subjects, then you can skip to 3b
672716if not new_subjects_dt.nrows:
673717 print('No subjects to import. You may skip this step')
674718
@@ -687,7 +731,7 @@ def get_wrapped_values(val: str = None):
687731
688732# ///////////////////////////////////////
689733
690- # ~ 2b ~
734+ # ~ 3b ~
691735# Import new sample metadata
692736new_samples_dt = shipment_dt[
693737 f.isNewSample, (
@@ -698,19 +742,23 @@ def get_wrapped_values(val: str = None):
698742 f.materialType,
699743 f.pathological_state,
700744 f.percentageTumorCells,
745+ f.anatomicalLocation,
746+ f.anatomicalLocationComment,
701747 f.partOfRelease,
702748 f.batch,
703749 f.organisation,
704750 f.ERN,
705751 f.dateRecordCreated,
706752 f.recordCreatedBy)]
707753
754+ new_samples_dt['retracted'] = 'N'
708755new_samples_dt.names = {'subjectID': 'belongsToSubject'}
756+
709757rd3_prod.import_dt('solverd_samples', new_samples_dt)
710758
711759# ///////////////////////////////////////
712760
713- # ~ 2b ~
761+ # ~ 3c ~
714762# Update subject release information
715763
716764existing_subjects_dt = shipment_dt[f.isNewSubject == False, :]
@@ -743,7 +791,7 @@ def get_wrapped_values(val: str = None):
743791
744792# ///////////////////////////////////////
745793
746- # ~ 2c ~
794+ # ~ 3d ~
747795# Update release and batch info in the samples table
748796
749797if bool(samples_with_updates):
@@ -789,7 +837,7 @@ def get_wrapped_values(val: str = None):
789837
790838# ///////////////////////////////////////
791839
792- # ~ 2d ~
840+ # ~ 3e ~
793841# Update processed status in the portal table
794842
795843processed_ids = []
0 commit comments