
Commit e7b89a4

Merge pull request #77 from daniel-unyi-42/main

Update tutorial notebook

2 parents ed570a5 + 7e6b4d8

File tree: 9 files changed, +1035 −1084 lines

README.md

Lines changed: 1 addition & 5 deletions

@@ -2,11 +2,8 @@

 [![pre-commit.ci status](https://results.pre-commit.ci/badge/github/EliHei2/segger_dev/main.svg)](https://results.pre-commit.ci/latest/github/EliHei2/segger_dev/main)

-
-
 **Important note (Dec 2024)**: As segger is currently undergoing constant development, we highly recommend installing it directly from GitHub.

-
 **segger** is a cutting-edge tool for **cell segmentation** in **single-molecule spatial omics** datasets. By leveraging **graph neural networks (GNNs)** and heterogeneous graphs, segger offers unmatched accuracy and scalability.

 # How segger Works
@@ -52,7 +49,7 @@ segger tackles these with a **graph-based approach**, achieving superior segment

 ---

-## Installation
+## Installation

 **Important note (Dec 2024)**: As segger is currently undergoing constant development, we highly recommend installing it directly from GitHub.

@@ -78,7 +75,6 @@ pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -

 Afterwards, choose the installation method that best suits your needs.

-
 ### GitHub Installation

 For a straightforward local installation from GitHub, clone the repository and install the package using `pip`:

docs/notebooks/segger_tutorial.ipynb

Lines changed: 914 additions & 934 deletions
Large diffs are not rendered by default.

scripts/create_data_fast_sample.py

Lines changed: 32 additions & 42 deletions

@@ -7,87 +7,77 @@
 import numpy as np
 from segger.data.parquet._utils import get_polygons_from_xy

-xenium_data_dir = Path('data_raw/breast_cancer/Xenium_FFPE_Human_Breast_Cancer_Rep1/outs/')
-segger_data_dir = Path('data_tidy/pyg_datasets/bc_rep1_emb')
+xenium_data_dir = Path("data_raw/breast_cancer/Xenium_FFPE_Human_Breast_Cancer_Rep1/outs/")
+segger_data_dir = Path("data_tidy/pyg_datasets/bc_rep1_emb")


-scrnaseq_file = Path('/omics/groups/OE0606/internal/tangy/tasks/schier/data/atals_filtered.h5ad')
-celltype_column = 'celltype_major'
-gene_celltype_abundance_embedding = calculate_gene_celltype_abundance_embedding(
-    sc.read(scrnaseq_file),
-    celltype_column
-)
+scrnaseq_file = Path("/omics/groups/OE0606/internal/tangy/tasks/schier/data/atals_filtered.h5ad")
+celltype_column = "celltype_major"
+gene_celltype_abundance_embedding = calculate_gene_celltype_abundance_embedding(sc.read(scrnaseq_file), celltype_column)

 sample = STSampleParquet(
     base_dir=xenium_data_dir,
     n_workers=4,
-    sample_type='xenium',
-    weights=gene_celltype_abundance_embedding, # uncomment if gene-celltype embeddings are available
+    sample_type="xenium",
+    weights=gene_celltype_abundance_embedding,  # uncomment if gene-celltype embeddings are available
 )

-transcripts = pd.read_parquet(
-    xenium_data_dir / 'transcripts.parquet',
-    filters=[[('overlaps_nucleus', '=', 1)]]
-)
-boundaries = pd.read_parquet(xenium_data_dir / 'nucleus_boundaries.parquet')
+transcripts = pd.read_parquet(xenium_data_dir / "transcripts.parquet", filters=[[("overlaps_nucleus", "=", 1)]])
+boundaries = pd.read_parquet(xenium_data_dir / "nucleus_boundaries.parquet")

-sizes = transcripts.groupby('cell_id').size()
-polygons = get_polygons_from_xy(boundaries, 'vertex_x', 'vertex_y', 'cell_id')
+sizes = transcripts.groupby("cell_id").size()
+polygons = get_polygons_from_xy(boundaries, "vertex_x", "vertex_y", "cell_id")
 densities = polygons[sizes.index].area / sizes
 bd_width = polygons.minimum_bounding_radius().median() * 2

 # 1/4 median boundary diameter
 dist_tx = bd_width / 4
 # 90th percentile density of bounding circle with radius=dist_tx
-k_tx = math.ceil(np.quantile(dist_tx ** 2 * np.pi * densities, 0.9))
+k_tx = math.ceil(np.quantile(dist_tx**2 * np.pi * densities, 0.9))

 print(k_tx)
 print(dist_tx)


 sample.save(
-    data_dir=segger_data_dir,
-    k_bd=3,
-    dist_bd=15.0,
-    k_tx=dist_tx,
-    dist_tx=k_tx,
-    tile_width=120,
-    tile_height=120,
-    neg_sampling_ratio=5.0,
-    frac=1.0,
-    val_prob=0.1,
-    test_prob=0.1,
+    data_dir=segger_data_dir,
+    k_bd=3,
+    dist_bd=15.0,
+    k_tx=dist_tx,
+    dist_tx=k_tx,
+    tile_width=120,
+    tile_height=120,
+    neg_sampling_ratio=5.0,
+    frac=1.0,
+    val_prob=0.1,
+    test_prob=0.1,
 )


-xenium_data_dir = Path('data_tidy/bc_5k')
-segger_data_dir = Path('data_tidy/pyg_datasets/bc_5k_emb')
-
+xenium_data_dir = Path("data_tidy/bc_5k")
+segger_data_dir = Path("data_tidy/pyg_datasets/bc_5k_emb")


 sample = STSampleParquet(
     base_dir=xenium_data_dir,
     n_workers=1,
-    sample_type='xenium',
-    weights=gene_celltype_abundance_embedding, # uncomment if gene-celltype embeddings are available
+    sample_type="xenium",
+    weights=gene_celltype_abundance_embedding,  # uncomment if gene-celltype embeddings are available
 )


-transcripts = pd.read_parquet(
-    xenium_data_dir / 'transcripts.parquet',
-    filters=[[('overlaps_nucleus', '=', 1)]]
-)
-boundaries = pd.read_parquet(xenium_data_dir / 'nucleus_boundaries.parquet')
+transcripts = pd.read_parquet(xenium_data_dir / "transcripts.parquet", filters=[[("overlaps_nucleus", "=", 1)]])
+boundaries = pd.read_parquet(xenium_data_dir / "nucleus_boundaries.parquet")

-sizes = transcripts.groupby('cell_id').size()
-polygons = get_polygons_from_xy(boundaries, 'vertex_x', 'vertex_y', 'cell_id')
+sizes = transcripts.groupby("cell_id").size()
+polygons = get_polygons_from_xy(boundaries, "vertex_x", "vertex_y", "cell_id")
 densities = polygons[sizes.index].area / sizes
 bd_width = polygons.minimum_bounding_radius().median() * 2

 # 1/4 median boundary diameter
 dist_tx = bd_width / 4
 # 90th percentile density of bounding circle with radius=dist_tx
-k_tx = math.ceil(np.quantile(dist_tx ** 2 * np.pi * densities, 0.9))
+k_tx = math.ceil(np.quantile(dist_tx**2 * np.pi * densities, 0.9))

 print(k_tx)
 print(dist_tx)

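The `dist_tx` / `k_tx` heuristic in this script can be exercised on its own. Below is a minimal sketch that reproduces the same arithmetic on tiny synthetic nuclei and transcript counts, so the two printed values can be inspected without downloading a Xenium dataset. The toy geometries, cell ids, and the use of geopandas/shapely in place of segger's `get_polygons_from_xy` are illustrative assumptions, not part of the script above.

```python
# A minimal, self-contained sketch of the k_tx / dist_tx heuristic above,
# run on tiny synthetic inputs. The toy coordinates and cell ids are made up.
import math

import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Polygon

# Toy nucleus boundaries: two rectangular nuclei, indexed by cell_id
# (the real script builds this GeoSeries with segger's get_polygons_from_xy).
polygons = gpd.GeoSeries(
    {
        "cell_1": Polygon([(0, 0), (10, 0), (10, 10), (0, 10)]),
        "cell_2": Polygon([(20, 0), (28, 0), (28, 8), (20, 8)]),
    }
)

# Toy nucleus-overlapping transcripts: per-cell counts stand in for the
# filtered transcripts.parquet table.
transcripts = pd.DataFrame({"cell_id": ["cell_1"] * 50 + ["cell_2"] * 30})
sizes = transcripts.groupby("cell_id").size()

# Same quantities as in the script:
densities = polygons[sizes.index].area / sizes               # nucleus area per transcript
bd_width = polygons.minimum_bounding_radius().median() * 2   # median boundary diameter

dist_tx = bd_width / 4                                       # 1/4 median boundary diameter
k_tx = math.ceil(np.quantile(dist_tx**2 * np.pi * densities, 0.9))

print(f"dist_tx = {dist_tx:.2f}, k_tx = {k_tx}")
```

On real data the script feeds these two values into `sample.save()` above.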
scripts/predict_model_sample.py

Lines changed: 2 additions & 4 deletions

@@ -16,12 +16,11 @@
 import dask.dataframe as dd


-
 seg_tag = "bc_fast_data_emb_major"
 model_version = 1

-segger_data_dir = Path('data_tidy/pyg_datasets') / seg_tag
-models_dir = Path("./models") / seg_tag
+segger_data_dir = Path("data_tidy/pyg_datasets") / seg_tag
+models_dir = Path("./models") / seg_tag
 benchmarks_dir = Path("/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_tidy/benchmarks/xe_rep1_bc")
 transcripts_file = "data_raw/xenium/Xenium_FFPE_Human_Breast_Cancer_Rep1/transcripts.parquet"
 # Initialize the Lightning data module
@@ -58,4 +57,3 @@
     gpu_ids=["0"],
     # client=client
 )
-

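Before running the prediction script above end to end, it can help to confirm that the referenced transcripts parquet is readable and that a trained checkpoint exists. The sketch below is an optional sanity check: the paths mirror the script, but the Xenium column layout and the `lightning_logs/version_*/checkpoints` directory structure are assumptions rather than guarantees from segger.

```python
# Optional sanity check for the inputs used by the prediction script above.
# Paths mirror the script; the checkpoint layout checked here is an assumption.
from pathlib import Path

import dask.dataframe as dd

transcripts_file = "data_raw/xenium/Xenium_FFPE_Human_Breast_Cancer_Rep1/transcripts.parquet"
models_dir = Path("./models") / "bc_fast_data_emb_major"

# Lazily open the transcripts table and inspect its schema without loading it all.
ddf = dd.read_parquet(transcripts_file)
print(ddf.columns.tolist())
print(ddf.head())  # pulls only the first partition

# List trained checkpoints, if any, under an assumed Lightning directory layout.
for ckpt in sorted(models_dir.glob("lightning_logs/version_*/checkpoints/*.ckpt")):
    print(ckpt)
```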