Skip to content

Commit 843c0ee

Browse files
committed
- Added column to the segmentation step FCS result file indicating if the membrane marker has affected the final segmentation shape
- Added confidence threshold for cluster refinement
- Added unlimited rules for cluster refinement
- Fixed bug while copying log file on pipex process error
- Fixed/added bit depth in segmentation results
- Added date to log traces
- Updated TissUUmaps library version
- Updated psutil library version
1 parent daf713b commit 843c0ee

File tree

10 files changed

+129
-116
lines changed

10 files changed

+129
-116
lines changed

README.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,12 @@ Installation
6161
- Profit!
6262

6363
[Note] a docker image is also provided. Just remember to configure the environment variable for PIPEX's "work" directory.
64-
64+
[Note] if you are installing PIPEX in your own laptop you might need some other packages/libraries. Please install these:
65+
- `sudo apt-get install python3.11-dev`
66+
- `sudo apt-get install libvips`
67+
- `sudo apt-get install -y python3-opencv`
68+
- `sudo apt-get install -y gcc`
69+
- `sudo apt-get install python3-tk`
6570

6671

6772
Running PIPEX

analysis.py

Lines changed: 38 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ def data_calculations():
6060
if len(analysis_markers) > 0:
6161
markers = analysis_markers
6262

63-
print(">>> List of markers to analyze gathered ",markers," =", datetime.datetime.now().strftime("%H:%M:%S"), flush=True)
63+
print(">>> List of markers to analyze gathered ",markers," =", datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"), flush=True)
6464

6565
for marker in markers:
6666
df_norm[marker] = pd.to_numeric(df_norm[marker]).fillna(0)
@@ -86,7 +86,7 @@ def data_calculations():
8686
df_norm.drop(index=filter_set, axis=0, inplace=True)
8787

8888
if cellsize_min > 0 or cellsize_max > 0 or custom_filter == 'yes':
89-
print(">>> Cells filtered =", datetime.datetime.now().strftime("%H:%M:%S"), flush=True)
89+
print(">>> Cells filtered =", datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"), flush=True)
9090

9191
#We normalize all the markers through min-max
9292
for marker in markers:
@@ -98,7 +98,7 @@ def data_calculations():
9898
df_norm[marker] = df_norm[marker].apply(lambda x: (x - marker_min) / (marker_max - marker_min))
9999

100100
if log_norm == 'yes' or std_norm == 'yes':
101-
print(">>> Markers normalized =", datetime.datetime.now().strftime("%H:%M:%S"), flush=True)
101+
print(">>> Markers normalized =", datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"), flush=True)
102102

103103
#Alternative normalization via z-score
104104
#for marker in markers:
@@ -118,13 +118,13 @@ def data_calculations():
118118

119119
df_norm[markers] = pycombat(df_norm[markers].transpose(), batch).transpose()
120120

121-
print(">>> ComBat batch correction performed =", datetime.datetime.now().strftime("%H:%M:%S"), flush=True)
121+
print(">>> ComBat batch correction performed =", datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"), flush=True)
122122

123123
#Quantile normalization
124124
if quantile_norm == 'yes':
125125
df_norm[markers] = qnorm.quantile_normalize(df_norm[markers].transpose()).transpose()
126126

127-
print(">>> Quantile normalization performed =", datetime.datetime.now().strftime("%H:%M:%S"), flush=True)
127+
print(">>> Quantile normalization performed =", datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"), flush=True)
128128

129129
df_norm.to_csv(data_folder + '/analysis/downstream/cell_data_norm.csv', index=False)
130130

@@ -204,13 +204,13 @@ def data_calculations():
204204
plt.close()
205205

206206
df_ext.to_csv(data_folder + '/analysis/downstream/cell_data_markers.csv', index=False)
207-
print(">>> Markers information calculated =", datetime.datetime.now().strftime("%H:%M:%S"), flush=True)
207+
print(">>> Markers information calculated =", datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"), flush=True)
208208

209209
del df_corr
210210
del df_ext
211211

212212
fill_surface_html_template(markers, df_norm)
213-
print(">>> Markers surface plot generated =", datetime.datetime.now().strftime("%H:%M:%S"), flush=True)
213+
print(">>> Markers surface plot generated =", datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"), flush=True)
214214

215215
return df_norm, markers
216216

@@ -301,30 +301,16 @@ def check_cell_type(row, cluster_id, clustering_merge_data):
301301
high_threshold = clustering_merge_data['scores'][cluster_id]['q75']
302302
low_threshold = clustering_merge_data['scores'][cluster_id]['q25']
303303
final_score = 0
304-
num_rules = 0
305-
if not pd.isnull(row['marker1']):
306-
curr_marker = row['marker1']
307-
curr_rule = row['rule1']
304+
num_rules = 1
305+
while not pd.isnull(row['marker' + str(num_rules)]):
306+
curr_marker = row['marker' + str(num_rules)]
307+
curr_rule = row['rule' + str(num_rules)]
308308
num_rules = num_rules + 1
309309
if curr_marker in clustering_merge_data['scores'][cluster_id]['markers']:
310310
curr_score = clustering_merge_data['scores'][cluster_id]['markers'][curr_marker]
311311
final_score = check_cell_type_threshold(curr_marker, curr_rule, curr_score, high_threshold, low_threshold)
312-
if not pd.isnull(row['marker2']):
313-
curr_marker = row['marker2']
314-
curr_rule = row['rule2']
315-
num_rules = num_rules + 1
316-
if curr_marker in clustering_merge_data['scores'][cluster_id]['markers']:
317-
curr_score = clustering_merge_data['scores'][cluster_id]['markers'][curr_marker]
318-
final_score = final_score + check_cell_type_threshold(curr_marker, curr_rule, curr_score, high_threshold, low_threshold)
319-
if not pd.isnull(row['marker3']):
320-
curr_marker = row['marker3']
321-
curr_rule = row['rule3']
322-
num_rules = num_rules + 1
323-
if curr_marker in clustering_merge_data['scores'][cluster_id]['markers']:
324-
curr_score = clustering_merge_data['scores'][cluster_id]['markers'][curr_marker]
325-
final_score = final_score + check_cell_type_threshold(curr_marker, curr_rule, curr_score, high_threshold, low_threshold)
326312
if final_score > 0:
327-
return final_score / num_rules
313+
return final_score / (num_rules - 1)
328314

329315
return None
330316

@@ -412,21 +398,26 @@ def refine_clustering(adata, cluster_type):
412398
if cell_type_prob is not None:
413399
if clustering_merge_data['scores'][cluster_id]['score_dif'] < clustering_merge_data['dif_median']:
414400
cell_type_prob = cell_type_prob * clustering_merge_data['scores'][cluster_id]['score_dif'] / clustering_merge_data['dif_median']
415-
clustering_merge_data['cell_types'][cluster_id].append({ 'cell_type' : row['cell_group'] + '.' + row['cell_type'] + '.' + row['cell_subtype'], 'prob' : cell_type_prob})
401+
clustering_merge_data['cell_types'][cluster_id].append({ 'cell_type' : row['cell_group'] + '.' + row['cell_type'] + '.' + row['cell_subtype'], 'prob' : cell_type_prob, 'confidence_threshold' : row['min_confidence']})
416402
clustering_merge_data['candidates'] = {}
417403
adata.obs[cluster_type + "_ref"] = adata.obs[cluster_type].astype(str)
418404
adata.obs[cluster_type + "_ref_p"] = adata.obs[cluster_type].astype(str)
419405
ordered_cluster_keys = list(clustering_merge_data['cell_types'])
420406
ordered_cluster_keys.sort()
421407
for cluster_id in ordered_cluster_keys:
422408
best_candidate = None
409+
best_real_confidence = 0
423410
for curr_cell_type in clustering_merge_data['cell_types'][cluster_id]:
411+
curr_real_confidence = curr_cell_type['prob']
424412
curr_cell_type['prob'] = curr_cell_type['prob'] / len(clustering_merge_data['cell_types'][cluster_id])
425-
if best_candidate is None or best_candidate['prob'] < curr_cell_type['prob']:
413+
if (best_candidate is None or best_candidate['prob'] < curr_cell_type['prob']) and curr_real_confidence >= int(curr_cell_type['confidence_threshold']):
426414
best_candidate = { 'cell_type': curr_cell_type['cell_type'], 'prob' : curr_cell_type['prob'], 'real_confidence' : '{:.1%}'.format((curr_cell_type['prob'] * len(clustering_merge_data['cell_types'][cluster_id])) / 100.0) }
427-
clustering_merge_data['candidates'][cluster_id] = best_candidate
428-
adata.obs.loc[adata.obs[cluster_type + "_ref"] == cluster_id, cluster_type + "_ref"] = best_candidate['cell_type']
429-
adata.obs.loc[adata.obs[cluster_type + "_ref_p"] == cluster_id, cluster_type + "_ref_p"] = best_candidate['real_confidence'][:-1]
415+
best_real_confidence = curr_real_confidence
416+
417+
if best_real_confidence > 0:
418+
clustering_merge_data['candidates'][cluster_id] = best_candidate
419+
adata.obs.loc[adata.obs[cluster_type + "_ref"] == cluster_id, cluster_type + "_ref"] = best_candidate['cell_type']
420+
adata.obs.loc[adata.obs[cluster_type + "_ref_p"] == cluster_id, cluster_type + "_ref_p"] = best_candidate['real_confidence'][:-1]
430421

431422
with open(data_folder + '/analysis/downstream/cell_types_result_' + cluster_type + '.json', 'w') as outfile:
432423
json.dump(clustering_merge_data, outfile, indent = 4)
@@ -445,7 +436,7 @@ def clustering(df_norm, markers):
445436
adata.obs["y"] = np.array(df_norm.loc[:, 'y'])
446437

447438
adata.obsm["spatial"] = np.array(df_norm.loc[:, ['x', 'y']])
448-
print(">>> Anndata object created =", datetime.datetime.now().strftime("%H:%M:%S"), flush=True)
439+
print(">>> Anndata object created =", datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"), flush=True)
449440

450441
#We take the chance to show spatially the intensities of every marker
451442
if batch_corr == '' and len(df_norm.index) <= max_samples:
@@ -455,20 +446,20 @@ def clustering(df_norm, markers):
455446
plt.clf()
456447
plt.close()
457448
else:
458-
print(">>> Dataset too big to create spatial plots per marker =", datetime.datetime.now().strftime("%H:%M:%S"), flush=True)
449+
print(">>> Dataset too big to create spatial plots per marker =", datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"), flush=True)
459450

460451
#We calculate PCA, neighbors and UMAP for the anndata
461452
sc.pp.pca(adata)
462453
adata.obsm['X_pca'] = np.nan_to_num(adata.obsm['X_pca'], copy=False)
463-
print(">>> PCA calculated =", datetime.datetime.now().strftime("%H:%M:%S"), flush=True)
454+
print(">>> PCA calculated =", datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"), flush=True)
464455

465456
num_neighbors = int(max(5, 15 * min(1, max_samples / len(df_norm.index))))
466-
print(">>> n_neighbors set to",num_neighbors,"=", datetime.datetime.now().strftime("%H:%M:%S"), flush=True)
457+
print(">>> n_neighbors set to",num_neighbors,"=", datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"), flush=True)
467458
sc.pp.neighbors(adata, n_neighbors=num_neighbors)
468-
print(">>> Neighbors graph calculated =", datetime.datetime.now().strftime("%H:%M:%S"), flush=True)
459+
print(">>> Neighbors graph calculated =", datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"), flush=True)
469460

470461
sc.tl.umap(adata)
471-
print(">>> UMAP calculated =", datetime.datetime.now().strftime("%H:%M:%S"), flush=True)
462+
print(">>> UMAP calculated =", datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"), flush=True)
472463
plt.figure()
473464
sc.pl.umap(adata, show=False, save='_base')
474465
plt.clf()
@@ -482,7 +473,7 @@ def clustering(df_norm, markers):
482473
if (leiden == 'yes'):
483474
#We calculate leiden cluster
484475
sc.tl.leiden(adata)
485-
print(">>> Leiden cluster calculated =", datetime.datetime.now().strftime("%H:%M:%S"), flush=True)
476+
print(">>> Leiden cluster calculated =", datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"), flush=True)
486477

487478
#We print the complete leiden cluster and all related information
488479
calculate_cluster_info(adata, "leiden")
@@ -492,13 +483,13 @@ def clustering(df_norm, markers):
492483
refine_clustering(adata, 'leiden')
493484
calculate_cluster_info(adata, "leiden_ref")
494485
except Exception as e:
495-
print(">>> Failed at refining leiden cluster =", datetime.datetime.now().strftime("%H:%M:%S"), flush=True)
486+
print(">>> Failed at refining leiden cluster =", datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"), flush=True)
496487

497488
if (kmeans == 'yes'):
498489
if (elbow == 'yes'):
499490
if (len(df_norm.index) <= max_samples):
500491
#We calculate all kmeans clusters with k 1 to 20 so we can show the elbow method plots
501-
print(">>> Performing kmeans elbow method =", datetime.datetime.now().strftime("%H:%M:%S"), flush=True)
492+
print(">>> Performing kmeans elbow method =", datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"), flush=True)
502493
distortions = []
503494
inertias = []
504495
mapping1 = {}
@@ -516,7 +507,7 @@ def clustering(df_norm, markers):
516507
mapping1[k] = sum(np.min(cdist(adata.obsm['X_pca'], kmeanModel.cluster_centers_,
517508
'euclidean'), axis=1)) / adata.obsm['X_pca'].shape[0]
518509
mapping2[k] = kmeanModel.inertia_
519-
print(">>> Kmeans cluster calculated with k",k," =", datetime.datetime.now().strftime("%H:%M:%S"), flush=True)
510+
print(">>> Kmeans cluster calculated with k",k," =", datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"), flush=True)
520511

521512
plt.figure()
522513
plt.plot(K, distortions, 'bx-')
@@ -535,9 +526,9 @@ def clustering(df_norm, markers):
535526
plt.savefig(data_folder + '/analysis/downstream/elbow_inertia.jpg')
536527
plt.clf()
537528
plt.close()
538-
print(">>> Elbow method calculated =", datetime.datetime.now().strftime("%H:%M:%S"), flush=True)
529+
print(">>> Elbow method calculated =", datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"), flush=True)
539530
else:
540-
print(">>> Dataset too big to perform elbow method =", datetime.datetime.now().strftime("%H:%M:%S"), flush=True)
531+
print(">>> Dataset too big to perform elbow method =", datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"), flush=True)
541532

542533
#We print the complete spatial kmeans cluster and all related information. Please note that the k used is the one passed as parameter (or 10 by default)
543534
kmeans_cluster = KMeans(n_clusters=k_clusters, random_state=0).fit(adata.obsm['X_pca'])
@@ -551,7 +542,8 @@ def clustering(df_norm, markers):
551542
refine_clustering(adata, 'kmeans')
552543
calculate_cluster_info(adata, "kmeans_ref")
553544
except Exception as e:
554-
print(">>> Failed at refining kmeans cluster =", datetime.datetime.now().strftime("%H:%M:%S"), flush=True)
545+
print(">>> Failed at refining kmeans cluster =", datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"), flush=True)
546+
print(e,flush=True)
555547

556548
if (leiden == 'yes' or kmeans == 'yes'):
557549
df = pd.read_csv(data_folder + '/analysis/cell_data.csv')
@@ -667,7 +659,7 @@ def options(argv):
667659
f.write(str(os.getpid()))
668660
f.close()
669661

670-
print(">>> Start time analysis =", datetime.datetime.now().strftime("%H:%M:%S"), flush=True)
662+
print(">>> Start time analysis =", datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"), flush=True)
671663

672664
try:
673665
os.mkdir(data_folder + '/analysis/downstream')
@@ -687,4 +679,4 @@ def options(argv):
687679

688680
clustering(df_norm, markers)
689681

690-
print(">>> End time analysis =", datetime.datetime.now().strftime("%H:%M:%S"), flush=True)
682+
print(">>> End time analysis =", datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"), flush=True)

generate_filtered_masks.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ def options(argv):
6363
f.write(str(os.getpid()))
6464
f.close()
6565

66-
print(">>> Start time generate_filtered_masks =", datetime.datetime.now().strftime("%H:%M:%S"), flush=True)
66+
print(">>> Start time generate_filtered_masks =", datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"), flush=True)
6767

6868
#Load segmentation data in numpy array format
6969
labels = np.load(data_folder + '/analysis/segmentation_data.npy')
@@ -80,16 +80,16 @@ def options(argv):
8080
labels = np.where(ix, labels, 0)
8181

8282
np.save(data_folder + '/analysis/segmentation_data_filtered.npy', labels)
83-
print(">>> Filtered segmentation result numpy binary data saved =", datetime.datetime.now().strftime("%H:%M:%S"), flush=True)
83+
print(">>> Filtered segmentation result numpy binary data saved =", datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"), flush=True)
8484

8585
labelsBinary = np.copy(labels)
8686
labelsBinary[labelsBinary > 0] = 1
8787
imsave(data_folder + "/analysis/segmentation_filtered_binary_mask.tif", np.uint16(labelsBinary * 65535))
88-
print(">>> Filtered segmentation result binary image saved =", datetime.datetime.now().strftime("%H:%M:%S"), flush=True)
88+
print(">>> Filtered segmentation result binary image saved =", datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"), flush=True)
8989
del labelsBinary
9090

9191
imsave(data_folder + "/analysis/segmentation_filtered_mask.tif", np.uint16(labels * 65535))
92-
print(">>> Filtered segmentation result image saved =", datetime.datetime.now().strftime("%H:%M:%S"), flush=True)
92+
print(">>> Filtered segmentation result image saved =", datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"), flush=True)
9393

9494
if (tile_size > 0):
9595
if (tile_overlap == 0 and tile_overlap_percentage > 0):
@@ -127,16 +127,16 @@ def options(argv):
127127
tile = relabel_sequential(tile)[0]
128128
tile_desc = str(row) + '_' + str(column)
129129
np.save(data_folder + '/analysis/segmentation_data_filtered_tile_' + tile_desc + '.npy', tile)
130-
print(">>> Filtered segmentation tile ",tile_desc," result numpy binary data saved =", datetime.datetime.now().strftime("%H:%M:%S"), flush=True)
130+
print(">>> Filtered segmentation tile ",tile_desc," result numpy binary data saved =", datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"), flush=True)
131131

132132
tileBinary = np.copy(tile)
133133
tileBinary[tileBinary > 0] = 1
134134
imsave(data_folder + "/analysis/segmentation_filtered_tile_" + tile_desc + "_binary_mask.tif", np.uint16(tileBinary * 65535))
135-
print(">>> Filtered segmentation tile ",tile_desc," result binary image saved =", datetime.datetime.now().strftime("%H:%M:%S"), flush=True)
135+
print(">>> Filtered segmentation tile ",tile_desc," result binary image saved =", datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"), flush=True)
136136
del tileBinary
137137

138138
imsave(data_folder + "/analysis/segmentation_filtered_tile_" + tile_desc + "_mask.tif", np.uint16(tile * 65535))
139-
print(">>> Filtered segmentation tile ",tile_desc," result image saved =", datetime.datetime.now().strftime("%H:%M:%S"), flush=True)
139+
print(">>> Filtered segmentation tile ",tile_desc," result image saved =", datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"), flush=True)
140140

141141

142-
print(">>> End time generate_filtered_masks =", datetime.datetime.now().strftime("%H:%M:%S"), flush=True)
142+
print(">>> End time generate_filtered_masks =", datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"), flush=True)

generate_geojson.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ def options(argv):
4949
f.write(str(os.getpid()))
5050
f.close()
5151

52-
print(">>> Start time generate_geojson =", datetime.datetime.now().strftime("%H:%M:%S"), flush=True)
52+
print(">>> Start time generate_geojson =", datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"), flush=True)
5353

5454
#Load segmentation data in numpy array format
5555
labels = np.load(data_folder + '/analysis/segmentation_data.npy')
@@ -76,7 +76,7 @@ def options(argv):
7676
borders = {}
7777
for chaincode in chaincodes:
7878
borders[chaincode.objectID] = np.array(chaincode.Polygon()).tolist()
79-
print(">>> Labelled regions to approximate polygons conversion finished =", datetime.datetime.now().strftime("%H:%M:%S"), flush=True)
79+
print(">>> Labelled regions to approximate polygons conversion finished =", datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"), flush=True)
8080

8181
#generating geojson data to import in qupath
8282
GEOdata = []
@@ -119,4 +119,4 @@ def options(argv):
119119
with open(data_folder + '/analysis/cell_segmentation_geo.json', 'w') as outfile:
120120
geojson.dump(GEOdata, outfile)
121121

122-
print(">>> End time generate_geojson =", datetime.datetime.now().strftime("%H:%M:%S"), flush=True)
122+
print(">>> End time generate_geojson =", datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"), flush=True)

0 commit comments

Comments (0)