Skip to content

Commit e99876e

Browse files
committed
Add baysor
1 parent 8b8b69d commit e99876e

File tree

2 files changed

+270
-0
lines changed

2 files changed

+270
-0
lines changed
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
__merge__: /src/api/comp_method_transcript_assignment.yaml
2+
3+
name: baysor
4+
label: "Baysor Transcript Assignment"
5+
summary: "Assign transcripts to cells using the Baysor method"
6+
description: "Baysor is a tool for performing cell segmentation on imaging-based spatial transcriptomics data. It optimizes segmentation considering the likelihood of transcriptional composition, size and shape of the cell."
7+
links:
8+
documentation: "https://kharchenkolab.github.io/Baysor/dev/"
9+
repository: "https://github.com/kharchenkolab/Baysor"
10+
references:
11+
doi: "10.1038/s41587-021-01044-w"
12+
13+
arguments:
14+
- name: --transcripts_key
15+
type: string
16+
description: The key of the transcripts within the points of the spatial data
17+
default: transcripts
18+
- name: --coordinate_system
19+
type: string
20+
description: The key of the pixel space coordinate system within the spatial data
21+
default: global
22+
23+
- name: --force_2d
24+
type: string
25+
required: false
26+
description: "Ignores z-column in the data if it is provided"
27+
direction: input
28+
default: "false"
29+
30+
- name: --min_molecules_per_cell
31+
type: integer
32+
required: false
33+
description: "Minimal number of molecules per cell"
34+
direction: input
35+
default: 50
36+
37+
- name: --scale
38+
type: double
39+
required: false
40+
description: |
41+
"Scale parameter, which suggest approximate cell radius for the algorithm. Must be in the same units as
42+
x and y molecule coordinates. Negative values mean it must be estimated from `min_molecules_per_cell`."
43+
direction: input
44+
default: -1.0
45+
46+
- name: --scale_std
47+
type: string
48+
required: false
49+
description: "Standard deviation of scale across cells relative to `scale`"
50+
direction: input
51+
default: "25%"
52+
53+
- name: --n_clusters
54+
type: integer
55+
required: false
56+
description: "Number of molecule clusters, i.e. major cell types."
57+
direction: input
58+
default: 4
59+
60+
- name: --prior_segmentation_confidence
61+
type: double
62+
required: false
63+
description: "Confidence of the prior segmentation"
64+
direction: input
65+
default: 0.8
66+
67+
resources:
68+
- type: python_script
69+
path: script.py
70+
71+
engines:
72+
- type: docker
73+
image: openproblems/base_python:1
74+
__merge__:
75+
- /src/base/setup_spatialdata_partial.yaml
76+
setup:
77+
- type: docker
78+
run:
79+
- wget https://github.com/kharchenkolab/Baysor/releases/download/v0.7.1/baysor-x86_x64-linux-v0.7.1_build.zip
80+
- unzip baysor-x86_x64-linux-v0.7.1_build.zip
81+
- chmod +x /bin/baysor
82+
- ln -sf /bin/baysor/bin/baysor /usr/local/bin/baysor
83+
- type: native
84+
85+
runners:
86+
- type: executable
87+
- type: nextflow
88+
directives:
89+
label: [ midtime, midcpu, midmem ]
Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
import os
2+
import shutil
3+
from pathlib import Path
4+
from tifffile import imwrite
5+
import dask
6+
import numpy as np
7+
import pandas as pd
8+
import anndata as ad
9+
import spatialdata as sd
10+
11+
12+
## VIASH START
13+
# Note: this section is auto-generated by viash at runtime. To edit it, make changes
14+
# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`.
15+
par = {
16+
'input_ist': 'resources_test/task_ist_preprocessing/mouse_brain_combined/raw_ist.zarr',
17+
'input_segmentation': 'resources_test/task_ist_preprocessing/mouse_brain_combined/segmentation.zarr',
18+
'transcripts_key': 'transcripts',
19+
'coordinate_system': 'global',
20+
'output': './temp/methods/baysor/baysor_assigned_transcripts.zarr',
21+
22+
'force_2d': 'false',
23+
'min_molecules_per_cell': 50,
24+
'scale': -1.0,
25+
'scale_std': "25%",
26+
'n_clusters': 4,
27+
'prior_segmentation_confidence': 0.8,
28+
}
29+
meta = {
30+
'name': 'baysor_transcript_assignment',
31+
'temp_dir': "./temp/methods/baysor"
32+
}
33+
## VIASH END
34+
35+
TMP_DIR = Path(meta["temp_dir"] or "/tmp")
36+
TMP_DIR.mkdir(parents=True, exist_ok=True)
37+
38+
TRANSCRIPTS_CSV = TMP_DIR / "transcripts.csv"
39+
SEGMENTATION_TIF = TMP_DIR / "segmentation.tif"
40+
CONFIG_TOML = TMP_DIR / "config.toml"
41+
BAYSOR_OUTPUT = TMP_DIR / "baysor_output.csv"
42+
43+
44+
# Read input
45+
print('Reading input files', flush=True)
46+
sdata = sd.read_zarr(par['input_ist'])
47+
sdata_segm = sd.read_zarr(par['input_segmentation'])
48+
49+
# Check if coordinate system is available in input data
50+
transcripts_coord_systems = sd.transformations.get_transformation(sdata[par["transcripts_key"]], get_all=True).keys()
51+
assert par['coordinate_system'] in transcripts_coord_systems, f"Coordinate system '{par['coordinate_system']}' not found in input data."
52+
segmentation_coord_systems = sd.transformations.get_transformation(sdata_segm["segmentation"], get_all=True).keys()
53+
assert par['coordinate_system'] in segmentation_coord_systems, f"Coordinate system '{par['coordinate_system']}' not found in input data."
54+
55+
# Transform transcript coordinates to the coordinate system
56+
print('Transforming transcripts coordinates', flush=True)
57+
transcripts = sd.transform(sdata[par['transcripts_key']], to_coordinate_system=par['coordinate_system'])
58+
59+
# In case of a translation transformation of the segmentation (e.g. crop of the data), we need to adjust the transcript coordinates
60+
trans = sd.transformations.get_transformation(sdata_segm["segmentation"], get_all=True)[par['coordinate_system']].inverse()
61+
transcripts = sd.transform(transcripts, trans, par['coordinate_system'])
62+
63+
64+
# Write transcripts to csv
65+
print('Writing transcripts to csv', flush=True)
66+
transcripts[['x', 'y', 'z', 'feature_name']].compute().to_csv(TRANSCRIPTS_CSV)
67+
68+
# Write segmentation to tif
69+
print('Writing segmentation to tif', flush=True)
70+
imwrite(SEGMENTATION_TIF, sdata_segm["segmentation"]["scale0"].image.to_numpy())
71+
72+
# Write config to toml
73+
print('Writing config to toml', flush=True)
74+
toml_str = f"""[data]
75+
x = "x"
76+
y = "y"
77+
z = "z"
78+
gene = "feature_name"
79+
force_2d = {par['force_2d']}
80+
min_molecules_per_cell = {int(par['min_molecules_per_cell'])}
81+
exclude_genes = ""
82+
83+
[segmentation]
84+
scale = {float(par['scale'])}
85+
scale_std = "{par['scale_std']}"
86+
n_clusters = {int(par['n_clusters'])}
87+
prior_segmentation_confidence = {float(par['prior_segmentation_confidence'])}
88+
"""
89+
with open(CONFIG_TOML, "w") as toml_file:
90+
toml_file.write(toml_str)
91+
92+
93+
# Run Baysor
94+
print('Running Baysor', flush=True)
95+
baysor_cmd = f"baysor run -c {CONFIG_TOML} -o {BAYSOR_OUTPUT} {TRANSCRIPTS_CSV} {SEGMENTATION_TIF}"
96+
print("\t" + baysor_cmd, flush=True)
97+
os.system(baysor_cmd)
98+
99+
100+
# Read Baysor output
101+
print('Reading Baysor output', flush=True)
102+
df_baysor = pd.read_csv(BAYSOR_OUTPUT)
103+
104+
# Formatting of Baysor output
105+
print('Formatting Baysor output', flush=True)
106+
107+
def convert_str_ids_to_ints(df, file_path_for_error_messages=None):
108+
"""Convert cell ids like "CR4b68f93d8-27" to 27
109+
110+
The file argument is just for creating more informative Error messages.
111+
"""
112+
113+
df = df.copy()
114+
file = file_path_for_error_messages
115+
116+
unique_cell_values = df.loc[~df["cell"].isnull(), "cell"].unique()
117+
unique_types = {type(value) for value in unique_cell_values}
118+
n_cells_pre = len(df.loc[~df["cell"].isnull(),"cell"].unique())
119+
if (len(unique_types) == 1) and (str in unique_types):
120+
df.loc[~df["cell"].isnull(),"cell"] = df.loc[~df["cell"].isnull(),"cell"].apply(lambda i: i.split("-")[-1]).astype(int)
121+
elif (len(unique_types) != 1):
122+
raise ValueError(f"Non NaN values of column 'cell' in file {file} have multiple types: {unique_types}")
123+
n_cells_post = len(df.loc[~df["cell"].isnull(),"cell"].unique())
124+
125+
if n_cells_pre != n_cells_post:
126+
raise ValueError(f"Number of cells changed after conversion to integers, probably baysor used different substrings with same integers for some cells. Check file: {file}")
127+
128+
# Convert nan values to 0 (background)
129+
df.loc[df["cell"].isnull(),"cell"] = 0
130+
131+
132+
return df
133+
134+
df_baysor = convert_str_ids_to_ints(df_baysor, file_path_for_error_messages=BAYSOR_OUTPUT)
135+
136+
137+
# Add cell ids to transcripts
138+
print('Adding cell ids to transcripts', flush=True)
139+
cell_id_dask_series = dask.dataframe.from_dask_array(
140+
dask.array.from_array(
141+
df_baysor['cell'].values, chunks=tuple(sdata[par['transcripts_key']].map_partitions(len).compute())
142+
),
143+
index=sdata[par['transcripts_key']].index
144+
)
145+
sdata[par['transcripts_key']]["cell_id"] = cell_id_dask_series
146+
147+
148+
# Create objects for cells table
149+
print('Creating objects for cells table', flush=True)
150+
#create new .obs for cells based on the segmentation output (corresponding with the transcripts 'cell_id')
151+
unique_cells = np.unique(cell_id_dask_series)
152+
153+
# check if a '0' (noise/background) cell is in cell_id and remove
154+
zero_idx = np.where(unique_cells == 0)
155+
if len(zero_idx[0]): unique_cells=np.delete(unique_cells, zero_idx[0][0])
156+
157+
#transform into pandas series and check
158+
cell_id_col = pd.Series(unique_cells, name='cell_id', index=unique_cells)
159+
assert 0 not in cell_id_col, "Found '0' in cell_id column of assingment output cell matrix"
160+
161+
162+
# Create transcripts only sdata
163+
print('Subsetting to transcripts cell id data', flush=True)
164+
sdata_transcripts_only = sd.SpatialData(
165+
points={
166+
"transcripts": sdata[par['transcripts_key']]
167+
},
168+
tables={
169+
"table": ad.AnnData(
170+
obs=pd.DataFrame(cell_id_col),
171+
var=sdata.tables["table"].var[[]]
172+
)
173+
}
174+
)
175+
176+
# Write output
177+
print('Write transcripts with cell ids', flush=True)
178+
if os.path.exists(par["output"]):
179+
shutil.rmtree(par["output"])
180+
181+
sdata_transcripts_only.write(par['output'])

0 commit comments

Comments
 (0)