Skip to content

Commit 9a88480

Browse files
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
1 parent 4bffcd1 commit 9a88480

22 files changed

+307542
-371
lines changed

analysis_summary.html

Lines changed: 307335 additions & 253 deletions
Large diffs are not rendered by default.

scripts/0_data_creation_5k_nucleus.py

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -35,28 +35,23 @@
3535
"""
3636

3737

38-
3938
XENIUM_DATA_DIR = Path(
4039
"/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_raw/xenium_seg_kit/human_CRC_real"
4140
)
4241
SEGGER_DATA_DIR = Path("data_tidy/pyg_datasets/human_CRC_seg_nuclei")
43-
SCRNASEQ_FILE = Path(
44-
"data_tidy/Human_CRC/scRNAseq.h5ad"
45-
)
46-
CELLTYPE_COLUMN = "Level1" # change this to your column name
42+
SCRNASEQ_FILE = Path("data_tidy/Human_CRC/scRNAseq.h5ad")
43+
CELLTYPE_COLUMN = "Level1" # change this to your column name
4744
scrnaseq = sc.read(SCRNASEQ_FILE)
4845

4946

50-
5147
# subsample the scRNAseq if needed
5248
# sc.pp.subsample(scrnaseq, 0.1)
5349
# scrnaseq.var_names_make_unique()
5450

5551

5652
# Calculate gene-celltype embeddings from reference data
5753
gene_celltype_abundance_embedding = calculate_gene_celltype_abundance_embedding(
58-
scrnaseq,
59-
CELLTYPE_COLUMN
54+
scrnaseq, CELLTYPE_COLUMN
6055
)
6156

6257
# Initialize spatial transcriptomics sample object
@@ -65,7 +60,7 @@
6560
n_workers=4,
6661
sample_type="xenium",
6762
weights=gene_celltype_abundance_embedding,
68-
scale_factor=1.
63+
scale_factor=1.0,
6964
)
7065

7166

@@ -77,7 +72,7 @@
7772
dist_tx=5, # Use calculated optimal search radius
7873
tile_size=10000, # Tile size for processing
7974
# tile_height=50,
80-
neg_sampling_ratio=10., # 10:1 negative:positive samples
75+
neg_sampling_ratio=10.0, # 10:1 negative:positive samples
8176
frac=1.0, # Use all data
8277
val_prob=0.3, # 30% validation set
8378
test_prob=0, # No test set

scripts/1_train_5k.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from segger.training.segger_data_module import SeggerDataModule
2+
23
# from segger.prediction.predict import predict, load_model
34
from segger.models.segger_model import Segger
45
from segger.training.train import LitSegger
@@ -9,14 +10,15 @@
910
from lightning.pytorch.plugins.environments import LightningEnvironment
1011
from matplotlib import pyplot as plt
1112
import seaborn as sns
13+
1214
# import pandas as pd
1315
from segger.data.utils import calculate_gene_celltype_abundance_embedding
16+
1417
# import scanpy as sc
1518
import os
1619
from lightning import LightningModule
1720

1821

19-
2022
segger_data_dir = Path("data_tidy/pyg_datasets/human_CRC_seg_cells")
2123
models_dir = Path("./models/human_CRC_seg_cells")
2224

@@ -43,14 +45,18 @@
4345

4446

4547
model = Segger(
46-
num_tx_tokens= num_tx_tokens,
48+
num_tx_tokens=num_tx_tokens,
4749
init_emb=8,
4850
hidden_channels=32,
4951
out_channels=16,
5052
heads=4,
5153
num_mid_layers=3,
5254
)
53-
model = to_hetero(model, (["tx", "bd"], [("tx", "belongs", "bd"), ("tx", "neighbors", "tx")]), aggr="sum")
55+
model = to_hetero(
56+
model,
57+
(["tx", "bd"], [("tx", "belongs", "bd"), ("tx", "neighbors", "tx")]),
58+
aggr="sum",
59+
)
5460

5561
batch = dm.train[0]
5662
model.forward(batch.x_dict, batch.edge_index_dict)

scripts/2_predict_5k.py

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8,26 +8,27 @@
88
import dask.dataframe as dd
99
import pandas as pd
1010
from pathlib import Path
11+
1112
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
1213
os.environ["CUPY_CACHE_DIR"] = "./.cupy"
1314

1415

15-
XENIUM_DATA_DIR = Path( #raw data dir
16+
XENIUM_DATA_DIR = Path( # raw data dir
1617
"/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_raw/xenium_seg_kit/human_CRC_real"
1718
)
18-
transcripts_file = (
19-
XENIUM_DATA_DIR / "transcripts.parquet"
20-
)
19+
transcripts_file = XENIUM_DATA_DIR / "transcripts.parquet"
2120

22-
SEGGER_DATA_DIR = Path("data_tidy/pyg_datasets/human_CRC_seg_nuclei") # preprocessed data dir
21+
SEGGER_DATA_DIR = Path(
22+
"data_tidy/pyg_datasets/human_CRC_seg_nuclei"
23+
) # preprocessed data dir
2324

2425

2526
seg_tag = "human_CRC_seg_nuclei"
2627
model_version = 0
27-
models_dir = Path("./models") / seg_tag #trained model dir
28+
models_dir = Path("./models") / seg_tag # trained model dir
2829

2930

30-
output_dir = Path( #output dir
31+
output_dir = Path( # output dir
3132
"/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_tidy/benchmarks/human_CRC_seg_nuclei"
3233
)
3334

@@ -58,10 +59,10 @@
5859
min_transcripts=5,
5960
score_cut=0.5,
6061
cell_id_col="segger_cell_id",
61-
save_transcripts= True,
62-
save_anndata= True,
63-
save_cell_masks= False, # Placeholder for future implementation
64-
use_cc=False, # if one wants fragments (groups of similar transcripts not attached to any nuclei)
62+
save_transcripts=True,
63+
save_anndata=True,
64+
save_cell_masks=False, # Placeholder for future implementation
65+
use_cc=False, # if one wants fragments (groups of similar transcripts not attached to any nuclei)
6566
knn_method="kd_tree",
6667
verbose=True,
6768
gpu_ids=["0"],

scripts/create_data_cosmx.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@
6969
)
7070

7171

72-
cells = list(set(transcript_counts.index) & set(nucleus_polygons.index))
72+
cells = list(set(transcript_counts.index) & set(nucleus_polygons.index))
7373
nucleus_polygons = nucleus_polygons[cells]
7474
transcript_counts = transcript_counts[cells]
7575

scripts/create_data_fast_sample.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -42,17 +42,14 @@
4242
"/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_raw/xenium_seg_kit/human_CRC_real"
4343
)
4444
SEGGER_DATA_DIR = Path("data_tidy/pyg_datasets/human_CRC_seg_nuclei")
45-
SCRNASEQ_FILE = Path(
46-
"data_tidy/Human_CRC/scRNAseq.h5ad"
47-
)
45+
SCRNASEQ_FILE = Path("data_tidy/Human_CRC/scRNAseq.h5ad")
4846
CELLTYPE_COLUMN = "Level1"
4947
scrnaseq = sc.read(SCRNASEQ_FILE)
5048
sc.pp.subsample(scrnaseq, 0.1)
5149
scrnaseq.var_names_make_unique()
5250
# Calculate gene-celltype embeddings from reference data
5351
gene_celltype_abundance_embedding = calculate_gene_celltype_abundance_embedding(
54-
scrnaseq,
55-
CELLTYPE_COLUMN
52+
scrnaseq, CELLTYPE_COLUMN
5653
)
5754

5855
# Initialize spatial transcriptomics sample object
@@ -61,7 +58,7 @@
6158
n_workers=4,
6259
sample_type="xenium",
6360
# scale_factor=0.8,
64-
weights=gene_celltype_abundance_embedding
61+
weights=gene_celltype_abundance_embedding,
6562
)
6663

6764
# # Load and filter data

scripts/create_data_merscope.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,8 @@
3838
# CELLTYPE_COLUMN = 'celltype_minor'
3939

4040

41-
MERSCOPE_DATA_DIR = Path('data_raw/merscope/processed/')
42-
SEGGER_DATA_DIR = Path('data_tidy/pyg_datasets/merscope_liver')
41+
MERSCOPE_DATA_DIR = Path("data_raw/merscope/processed/")
42+
SEGGER_DATA_DIR = Path("data_tidy/pyg_datasets/merscope_liver")
4343
# SCRNASEQ_FILE = Path('/omics/groups/OE0606/internal/mimmo/MERSCOPE/notebooks/data/scData/bh/bh_mng_scdata_20250306.h5ad')
4444
# CELLTYPE_COLUMN = 'annot_v1'
4545

@@ -80,4 +80,4 @@
8080
frac=1.0, # Use all data
8181
val_prob=0.3, # 30% validation set
8282
test_prob=0, # No test set
83-
)
83+
)

scripts/predict_model_sample.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,10 @@
1717
import dask.dataframe as dd
1818

1919

20-
2120
seg_tag = "human_CRC_seg_cells"
2221
model_version = 0
2322

2423

25-
2624
XENIUM_DATA_DIR = Path(
2725
"/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_raw/xenium_seg_kit/human_CRC_real"
2826
)
@@ -32,9 +30,7 @@
3230
benchmarks_dir = Path(
3331
"/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_tidy/benchmarks/human_CRC_seg_cells"
3432
)
35-
transcripts_file = (
36-
"/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_raw/xenium_seg_kit/human_CRC_real/transcripts.parquet"
37-
)
33+
transcripts_file = "/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_raw/xenium_seg_kit/human_CRC_real/transcripts.parquet"
3834
# Initialize the Lightning data module
3935
dm = SeggerDataModule(
4036
data_dir=SEGGER_DATA_DIR,

scripts/train_cosmx.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from segger.training.segger_data_module import SeggerDataModule
2+
23
# from segger.prediction.predict import predict, load_model
34
from segger.models.segger_model import Segger
45
from segger.training.train import LitSegger
@@ -9,14 +10,15 @@
910
from lightning.pytorch.plugins.environments import LightningEnvironment
1011
from matplotlib import pyplot as plt
1112
import seaborn as sns
13+
1214
# import pandas as pd
1315
from segger.data.utils import calculate_gene_celltype_abundance_embedding
16+
1417
# import scanpy as sc
1518
import os
1619
from lightning import LightningModule
1720

1821

19-
2022
segger_data_dir = Path("data_tidy/pyg_datasets/cosmx_pancreas_degbugged")
2123
models_dir = Path("./models/cosmx_pancreas")
2224

@@ -50,7 +52,11 @@
5052
heads=4,
5153
num_mid_layers=3,
5254
)
53-
model = to_hetero(model, (["tx", "bd"], [("tx", "belongs", "bd"), ("tx", "neighbors", "tx")]), aggr="sum")
55+
model = to_hetero(
56+
model,
57+
(["tx", "bd"], [("tx", "belongs", "bd"), ("tx", "neighbors", "tx")]),
58+
aggr="sum",
59+
)
5460

5561
batch = dm.train[0]
5662
model.forward(batch.x_dict, batch.edge_index_dict)

scripts/train_mimmo_batch.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from segger.training.segger_data_module import SeggerDataModule
2+
23
# from segger.prediction.predict import predict, load_model
34
from segger.models.segger_model import Segger
45
from segger.training.train import LitSegger
@@ -9,16 +10,21 @@
910
from lightning.pytorch.plugins.environments import LightningEnvironment
1011
from matplotlib import pyplot as plt
1112
import seaborn as sns
13+
1214
# import pandas as pd
1315
from segger.data.utils import calculate_gene_celltype_abundance_embedding
16+
1417
# import scanpy as sc
1518
import os
1619
from lightning import LightningModule
1720

1821

19-
20-
segger_data_dir = Path("data_tidy/pyg_datasets/project24_MNG/output-XETG00423__0042861__mng_07_TMA__20250303__153740")
21-
models_dir = Path("./models/project24_MNG/output-XETG00423__0042861__mng_07_TMA__20250303__153740")
22+
segger_data_dir = Path(
23+
"data_tidy/pyg_datasets/project24_MNG/output-XETG00423__0042861__mng_07_TMA__20250303__153740"
24+
)
25+
models_dir = Path(
26+
"./models/project24_MNG/output-XETG00423__0042861__mng_07_TMA__20250303__153740"
27+
)
2228

2329
# Base directory to store Pytorch Lightning models
2430
# models_dir = Path('models')
@@ -44,14 +50,18 @@
4450

4551
model = Segger(
4652
# is_token_based=is_token_based,
47-
num_tx_tokens= num_tx_tokens,
53+
num_tx_tokens=num_tx_tokens,
4854
init_emb=8,
4955
hidden_channels=32,
5056
out_channels=16,
5157
heads=4,
5258
num_mid_layers=3,
5359
)
54-
model = to_hetero(model, (["tx", "bd"], [("tx", "belongs", "bd"), ("tx", "neighbors", "tx")]), aggr="sum")
60+
model = to_hetero(
61+
model,
62+
(["tx", "bd"], [("tx", "belongs", "bd"), ("tx", "neighbors", "tx")]),
63+
aggr="sum",
64+
)
5565

5666
batch = dm.train[0]
5767
model.forward(batch.x_dict, batch.edge_index_dict)

0 commit comments

Comments
 (0)