@@ -60,7 +60,7 @@ def data_calculations():
6060 if len (analysis_markers ) > 0 :
6161 markers = analysis_markers
6262
63- print (">>> List of markers to analyze gathered " ,markers ," =" , datetime .datetime .now ().strftime ("%H:%M:%S" ), flush = True )
63+ print (">>> List of markers to analyze gathered " ,markers ," =" , datetime .datetime .now ().strftime ("%d/%m/%Y %H:%M:%S" ), flush = True )
6464
6565 for marker in markers :
6666 df_norm [marker ] = pd .to_numeric (df_norm [marker ]).fillna (0 )
@@ -86,7 +86,7 @@ def data_calculations():
8686 df_norm .drop (index = filter_set , axis = 0 , inplace = True )
8787
8888 if cellsize_min > 0 or cellsize_max > 0 or custom_filter == 'yes' :
89- print (">>> Cells filtered =" , datetime .datetime .now ().strftime ("%H:%M:%S" ), flush = True )
89+ print (">>> Cells filtered =" , datetime .datetime .now ().strftime ("%d/%m/%Y %H:%M:%S" ), flush = True )
9090
9191 #We normalize all the markers through min-max
9292 for marker in markers :
@@ -98,7 +98,7 @@ def data_calculations():
9898 df_norm [marker ] = df_norm [marker ].apply (lambda x : (x - marker_min ) / (marker_max - marker_min ))
9999
100100 if log_norm == 'yes' or std_norm == 'yes' :
101- print (">>> Markers normalized =" , datetime .datetime .now ().strftime ("%H:%M:%S" ), flush = True )
101+ print (">>> Markers normalized =" , datetime .datetime .now ().strftime ("%d/%m/%Y %H:%M:%S" ), flush = True )
102102
103103 #Alternative normalization via z-score
104104 #for marker in markers:
@@ -118,13 +118,13 @@ def data_calculations():
118118
119119 df_norm [markers ] = pycombat (df_norm [markers ].transpose (), batch ).transpose ()
120120
121- print (">>> ComBat batch correction performed =" , datetime .datetime .now ().strftime ("%H:%M:%S" ), flush = True )
121+ print (">>> ComBat batch correction performed =" , datetime .datetime .now ().strftime ("%d/%m/%Y %H:%M:%S" ), flush = True )
122122
123123 #Quantile normalization
124124 if quantile_norm == 'yes' :
125125 df_norm [markers ] = qnorm .quantile_normalize (df_norm [markers ].transpose ()).transpose ()
126126
127- print (">>> Quantile normalization performed =" , datetime .datetime .now ().strftime ("%H:%M:%S" ), flush = True )
127+ print (">>> Quantile normalization performed =" , datetime .datetime .now ().strftime ("%d/%m/%Y %H:%M:%S" ), flush = True )
128128
129129 df_norm .to_csv (data_folder + '/analysis/downstream/cell_data_norm.csv' , index = False )
130130
@@ -204,13 +204,13 @@ def data_calculations():
204204 plt .close ()
205205
206206 df_ext .to_csv (data_folder + '/analysis/downstream/cell_data_markers.csv' , index = False )
207- print (">>> Markers information calculated =" , datetime .datetime .now ().strftime ("%H:%M:%S" ), flush = True )
207+ print (">>> Markers information calculated =" , datetime .datetime .now ().strftime ("%d/%m/%Y %H:%M:%S" ), flush = True )
208208
209209 del df_corr
210210 del df_ext
211211
212212 fill_surface_html_template (markers , df_norm )
213- print (">>> Markers surface plot generated =" , datetime .datetime .now ().strftime ("%H:%M:%S" ), flush = True )
213+ print (">>> Markers surface plot generated =" , datetime .datetime .now ().strftime ("%d/%m/%Y %H:%M:%S" ), flush = True )
214214
215215 return df_norm , markers
216216
@@ -301,30 +301,16 @@ def check_cell_type(row, cluster_id, clustering_merge_data):
301301 high_threshold = clustering_merge_data ['scores' ][cluster_id ]['q75' ]
302302 low_threshold = clustering_merge_data ['scores' ][cluster_id ]['q25' ]
303303 final_score = 0
304- num_rules = 0
305- if not pd .isnull (row ['marker1' ]):
306- curr_marker = row ['marker1' ]
307- curr_rule = row ['rule1' ]
304+ num_rules = 1
305+ while not pd .isnull (row ['marker' + str ( num_rules ) ]):
306+ curr_marker = row ['marker' + str ( num_rules ) ]
307+ curr_rule = row ['rule' + str ( num_rules ) ]
308308 num_rules = num_rules + 1
309309 if curr_marker in clustering_merge_data ['scores' ][cluster_id ]['markers' ]:
310310 curr_score = clustering_merge_data ['scores' ][cluster_id ]['markers' ][curr_marker ]
311- final_score = check_cell_type_threshold (curr_marker , curr_rule , curr_score , high_threshold , low_threshold )
311+ final_score = final_score + check_cell_type_threshold (curr_marker , curr_rule , curr_score , high_threshold , low_threshold )
312- if not pd .isnull (row ['marker2' ]):
313- curr_marker = row ['marker2' ]
314- curr_rule = row ['rule2' ]
315- num_rules = num_rules + 1
316- if curr_marker in clustering_merge_data ['scores' ][cluster_id ]['markers' ]:
317- curr_score = clustering_merge_data ['scores' ][cluster_id ]['markers' ][curr_marker ]
318- final_score = final_score + check_cell_type_threshold (curr_marker , curr_rule , curr_score , high_threshold , low_threshold )
319- if not pd .isnull (row ['marker3' ]):
320- curr_marker = row ['marker3' ]
321- curr_rule = row ['rule3' ]
322- num_rules = num_rules + 1
323- if curr_marker in clustering_merge_data ['scores' ][cluster_id ]['markers' ]:
324- curr_score = clustering_merge_data ['scores' ][cluster_id ]['markers' ][curr_marker ]
325- final_score = final_score + check_cell_type_threshold (curr_marker , curr_rule , curr_score , high_threshold , low_threshold )
326312 if final_score > 0 :
327- return final_score / num_rules
313+ return final_score / ( num_rules - 1 )
328314
329315 return None
330316
@@ -412,21 +398,26 @@ def refine_clustering(adata, cluster_type):
412398 if cell_type_prob is not None :
413399 if clustering_merge_data ['scores' ][cluster_id ]['score_dif' ] < clustering_merge_data ['dif_median' ]:
414400 cell_type_prob = cell_type_prob * clustering_merge_data ['scores' ][cluster_id ]['score_dif' ] / clustering_merge_data ['dif_median' ]
415- clustering_merge_data ['cell_types' ][cluster_id ].append ({ 'cell_type' : row ['cell_group' ] + '.' + row ['cell_type' ] + '.' + row ['cell_subtype' ], 'prob' : cell_type_prob })
401+ clustering_merge_data ['cell_types' ][cluster_id ].append ({ 'cell_type' : row ['cell_group' ] + '.' + row ['cell_type' ] + '.' + row ['cell_subtype' ], 'prob' : cell_type_prob , 'confidence_threshold' : row [ 'min_confidence' ] })
416402 clustering_merge_data ['candidates' ] = {}
417403 adata .obs [cluster_type + "_ref" ] = adata .obs [cluster_type ].astype (str )
418404 adata .obs [cluster_type + "_ref_p" ] = adata .obs [cluster_type ].astype (str )
419405 ordered_cluster_keys = list (clustering_merge_data ['cell_types' ])
420406 ordered_cluster_keys .sort ()
421407 for cluster_id in ordered_cluster_keys :
422408 best_candidate = None
409+ best_real_confidence = 0
423410 for curr_cell_type in clustering_merge_data ['cell_types' ][cluster_id ]:
411+ curr_real_confidence = curr_cell_type ['prob' ]
424412 curr_cell_type ['prob' ] = curr_cell_type ['prob' ] / len (clustering_merge_data ['cell_types' ][cluster_id ])
425- if best_candidate is None or best_candidate ['prob' ] < curr_cell_type ['prob' ]:
413+ if ( best_candidate is None or best_candidate ['prob' ] < curr_cell_type ['prob' ]) and curr_real_confidence >= int ( curr_cell_type [ 'confidence_threshold' ]) :
426414 best_candidate = { 'cell_type' : curr_cell_type ['cell_type' ], 'prob' : curr_cell_type ['prob' ], 'real_confidence' : '{:.1%}' .format ((curr_cell_type ['prob' ] * len (clustering_merge_data ['cell_types' ][cluster_id ])) / 100.0 ) }
427- clustering_merge_data ['candidates' ][cluster_id ] = best_candidate
428- adata .obs .loc [adata .obs [cluster_type + "_ref" ] == cluster_id , cluster_type + "_ref" ] = best_candidate ['cell_type' ]
429- adata .obs .loc [adata .obs [cluster_type + "_ref_p" ] == cluster_id , cluster_type + "_ref_p" ] = best_candidate ['real_confidence' ][:- 1 ]
415+ best_real_confidence = curr_real_confidence
416+
417+ if best_real_confidence > 0 :
418+ clustering_merge_data ['candidates' ][cluster_id ] = best_candidate
419+ adata .obs .loc [adata .obs [cluster_type + "_ref" ] == cluster_id , cluster_type + "_ref" ] = best_candidate ['cell_type' ]
420+ adata .obs .loc [adata .obs [cluster_type + "_ref_p" ] == cluster_id , cluster_type + "_ref_p" ] = best_candidate ['real_confidence' ][:- 1 ]
430421
431422 with open (data_folder + '/analysis/downstream/cell_types_result_' + cluster_type + '.json' , 'w' ) as outfile :
432423 json .dump (clustering_merge_data , outfile , indent = 4 )
@@ -445,7 +436,7 @@ def clustering(df_norm, markers):
445436 adata .obs ["y" ] = np .array (df_norm .loc [:, 'y' ])
446437
447438 adata .obsm ["spatial" ] = np .array (df_norm .loc [:, ['x' , 'y' ]])
448- print (">>> Anndata object created =" , datetime .datetime .now ().strftime ("%H:%M:%S" ), flush = True )
439+ print (">>> Anndata object created =" , datetime .datetime .now ().strftime ("%d/%m/%Y %H:%M:%S" ), flush = True )
449440
450441 #We take the chance to show spatially the intensities of every marker
451442 if batch_corr == '' and len (df_norm .index ) <= max_samples :
@@ -455,20 +446,20 @@ def clustering(df_norm, markers):
455446 plt .clf ()
456447 plt .close ()
457448 else :
458- print (">>> Dataset too big to create spatial plots per marker =" , datetime .datetime .now ().strftime ("%H:%M:%S" ), flush = True )
449+ print (">>> Dataset too big to create spatial plots per marker =" , datetime .datetime .now ().strftime ("%d/%m/%Y %H:%M:%S" ), flush = True )
459450
460451 #We calculate PCA, neighbors and UMAP for the anndata
461452 sc .pp .pca (adata )
462453 adata .obsm ['X_pca' ] = np .nan_to_num (adata .obsm ['X_pca' ], copy = False )
463- print (">>> PCA calculated =" , datetime .datetime .now ().strftime ("%H:%M:%S" ), flush = True )
454+ print (">>> PCA calculated =" , datetime .datetime .now ().strftime ("%d/%m/%Y %H:%M:%S" ), flush = True )
464455
465456 num_neighbors = int (max (5 , 15 * min (1 , max_samples / len (df_norm .index ))))
466- print (">>> n_neighbors set to" ,num_neighbors ,"=" , datetime .datetime .now ().strftime ("%H:%M:%S" ), flush = True )
457+ print (">>> n_neighbors set to" ,num_neighbors ,"=" , datetime .datetime .now ().strftime ("%d/%m/%Y %H:%M:%S" ), flush = True )
467458 sc .pp .neighbors (adata , n_neighbors = num_neighbors )
468- print (">>> Neighbors graph calculated =" , datetime .datetime .now ().strftime ("%H:%M:%S" ), flush = True )
459+ print (">>> Neighbors graph calculated =" , datetime .datetime .now ().strftime ("%d/%m/%Y %H:%M:%S" ), flush = True )
469460
470461 sc .tl .umap (adata )
471- print (">>> UMAP calculated =" , datetime .datetime .now ().strftime ("%H:%M:%S" ), flush = True )
462+ print (">>> UMAP calculated =" , datetime .datetime .now ().strftime ("%d/%m/%Y %H:%M:%S" ), flush = True )
472463 plt .figure ()
473464 sc .pl .umap (adata , show = False , save = '_base' )
474465 plt .clf ()
@@ -482,7 +473,7 @@ def clustering(df_norm, markers):
482473 if (leiden == 'yes' ):
483474 #We calculate leiden cluster
484475 sc .tl .leiden (adata )
485- print (">>> Leiden cluster calculated =" , datetime .datetime .now ().strftime ("%H:%M:%S" ), flush = True )
476+ print (">>> Leiden cluster calculated =" , datetime .datetime .now ().strftime ("%d/%m/%Y %H:%M:%S" ), flush = True )
486477
487478 #We print the complete leiden cluster and all related information
488479 calculate_cluster_info (adata , "leiden" )
@@ -492,13 +483,13 @@ def clustering(df_norm, markers):
492483 refine_clustering (adata , 'leiden' )
493484 calculate_cluster_info (adata , "leiden_ref" )
494485 except Exception as e :
495- print (">>> Failed at refining leiden cluster =" , datetime .datetime .now ().strftime ("%H:%M:%S" ), flush = True )
486+ print (">>> Failed at refining leiden cluster =" , datetime .datetime .now ().strftime ("%d/%m/%Y %H:%M:%S" ), flush = True )
496487
497488 if (kmeans == 'yes' ):
498489 if (elbow == 'yes' ):
499490 if (len (df_norm .index ) <= max_samples ):
500491 #We calculate all kmeans clusters with k 1 to 20 so we can show the elbow method plots
501- print (">>> Performing kmeans elbow method =" , datetime .datetime .now ().strftime ("%H:%M:%S" ), flush = True )
492+ print (">>> Performing kmeans elbow method =" , datetime .datetime .now ().strftime ("%d/%m/%Y %H:%M:%S" ), flush = True )
502493 distortions = []
503494 inertias = []
504495 mapping1 = {}
@@ -516,7 +507,7 @@ def clustering(df_norm, markers):
516507 mapping1 [k ] = sum (np .min (cdist (adata .obsm ['X_pca' ], kmeanModel .cluster_centers_ ,
517508 'euclidean' ), axis = 1 )) / adata .obsm ['X_pca' ].shape [0 ]
518509 mapping2 [k ] = kmeanModel .inertia_
519- print (">>> Kmeans cluster calculated with k" ,k ," =" , datetime .datetime .now ().strftime ("%H:%M:%S" ), flush = True )
510+ print (">>> Kmeans cluster calculated with k" ,k ," =" , datetime .datetime .now ().strftime ("%d/%m/%Y %H:%M:%S" ), flush = True )
520511
521512 plt .figure ()
522513 plt .plot (K , distortions , 'bx-' )
@@ -535,9 +526,9 @@ def clustering(df_norm, markers):
535526 plt .savefig (data_folder + '/analysis/downstream/elbow_inertia.jpg' )
536527 plt .clf ()
537528 plt .close ()
538- print (">>> Elbow method calculated =" , datetime .datetime .now ().strftime ("%H:%M:%S" ), flush = True )
529+ print (">>> Elbow method calculated =" , datetime .datetime .now ().strftime ("%d/%m/%Y %H:%M:%S" ), flush = True )
539530 else :
540- print (">>> Dataset too big to perform elbow method =" , datetime .datetime .now ().strftime ("%H:%M:%S" ), flush = True )
531+ print (">>> Dataset too big to perform elbow method =" , datetime .datetime .now ().strftime ("%d/%m/%Y %H:%M:%S" ), flush = True )
541532
542533 #We print the complete spatial kmeans cluster and all related information. Please note that the k used is the one passad as parameter (or 10 by default)
543534 kmeans_cluster = KMeans (n_clusters = k_clusters , random_state = 0 ).fit (adata .obsm ['X_pca' ])
@@ -551,7 +542,8 @@ def clustering(df_norm, markers):
551542 refine_clustering (adata , 'kmeans' )
552543 calculate_cluster_info (adata , "kmeans_ref" )
553544 except Exception as e :
554- print (">>> Failed at refining kmeans cluster =" , datetime .datetime .now ().strftime ("%H:%M:%S" ), flush = True )
545+ print (">>> Failed at refining kmeans cluster =" , datetime .datetime .now ().strftime ("%d/%m/%Y %H:%M:%S" ), flush = True )
546+ print (e ,flush = True )
555547
556548 if (leiden == 'yes' or kmeans == 'yes' ):
557549 df = pd .read_csv (data_folder + '/analysis/cell_data.csv' )
@@ -667,7 +659,7 @@ def options(argv):
667659 f .write (str (os .getpid ()))
668660 f .close ()
669661
670- print (">>> Start time analysis =" , datetime .datetime .now ().strftime ("%H:%M:%S" ), flush = True )
662+ print (">>> Start time analysis =" , datetime .datetime .now ().strftime ("%d/%m/%Y %H:%M:%S" ), flush = True )
671663
672664 try :
673665 os .mkdir (data_folder + '/analysis/downstream' )
@@ -687,4 +679,4 @@ def options(argv):
687679
688680 clustering (df_norm , markers )
689681
690- print (">>> End time analysis =" , datetime .datetime .now ().strftime ("%H:%M:%S" ), flush = True )
682+ print (">>> End time analysis =" , datetime .datetime .now ().strftime ("%d/%m/%Y %H:%M:%S" ), flush = True )
0 commit comments