Skip to content

Commit ed570a5

Browse files
committed
create_sample
1 parent 584b9fc commit ed570a5

File tree

1 file changed

+64
-7
lines changed

1 file changed

+64
-7
lines changed

scripts/create_data_fast_sample.py

Lines changed: 64 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,16 @@
22
from path import Path
33
from segger.data.utils import calculate_gene_celltype_abundance_embedding
44
import scanpy as sc
5+
import pandas as pd
6+
import math
7+
import numpy as np
8+
from segger.data.parquet._utils import get_polygons_from_xy
59

610
xenium_data_dir = Path('data_raw/breast_cancer/Xenium_FFPE_Human_Breast_Cancer_Rep1/outs/')
7-
segger_data_dir = Path('data_tidy/pyg_datasets/bc_fast_data_emb_major')
11+
segger_data_dir = Path('data_tidy/pyg_datasets/bc_rep1_emb')
812

913

10-
scrnaseq_file = Path('data_tidy/benchmarks/xe_rep1_bc/scRNAseq.h5ad')
14+
scrnaseq_file = Path('/omics/groups/OE0606/internal/tangy/tasks/schier/data/atals_filtered.h5ad')
1115
celltype_column = 'celltype_major'
1216
gene_celltype_abundance_embedding = calculate_gene_celltype_abundance_embedding(
1317
sc.read(scrnaseq_file),
@@ -21,16 +25,69 @@
2125
weights=gene_celltype_abundance_embedding, # uncomment if gene-celltype embeddings are available
2226
)
2327

28+
transcripts = pd.read_parquet(
29+
xenium_data_dir / 'transcripts.parquet',
30+
filters=[[('overlaps_nucleus', '=', 1)]]
31+
)
32+
boundaries = pd.read_parquet(xenium_data_dir / 'nucleus_boundaries.parquet')
33+
34+
sizes = transcripts.groupby('cell_id').size()
35+
polygons = get_polygons_from_xy(boundaries, 'vertex_x', 'vertex_y', 'cell_id')
36+
densities = polygons[sizes.index].area / sizes
37+
bd_width = polygons.minimum_bounding_radius().median() * 2
38+
39+
# 1/4 median boundary diameter
40+
dist_tx = bd_width / 4
41+
# 90th percentile density of bounding circle with radius=dist_tx
42+
k_tx = math.ceil(np.quantile(dist_tx ** 2 * np.pi * densities, 0.9))
43+
44+
print(k_tx)
45+
print(dist_tx)
46+
47+
2448
sample.save(
2549
data_dir=segger_data_dir,
2650
k_bd=3,
2751
dist_bd=15.0,
28-
k_tx=20,
29-
dist_tx=3,
30-
tile_width=220,
31-
tile_height=220,
52+
k_tx=dist_tx,
53+
dist_tx=k_tx,
54+
tile_width=120,
55+
tile_height=120,
3256
neg_sampling_ratio=5.0,
3357
frac=1.0,
3458
val_prob=0.1,
3559
test_prob=0.1,
36-
)
60+
)
61+
62+
63+
xenium_data_dir = Path('data_tidy/bc_5k')
64+
segger_data_dir = Path('data_tidy/pyg_datasets/bc_5k_emb')
65+
66+
67+
68+
sample = STSampleParquet(
69+
base_dir=xenium_data_dir,
70+
n_workers=1,
71+
sample_type='xenium',
72+
weights=gene_celltype_abundance_embedding, # uncomment if gene-celltype embeddings are available
73+
)
74+
75+
76+
transcripts = pd.read_parquet(
77+
xenium_data_dir / 'transcripts.parquet',
78+
filters=[[('overlaps_nucleus', '=', 1)]]
79+
)
80+
boundaries = pd.read_parquet(xenium_data_dir / 'nucleus_boundaries.parquet')
81+
82+
sizes = transcripts.groupby('cell_id').size()
83+
polygons = get_polygons_from_xy(boundaries, 'vertex_x', 'vertex_y', 'cell_id')
84+
densities = polygons[sizes.index].area / sizes
85+
bd_width = polygons.minimum_bounding_radius().median() * 2
86+
87+
# 1/4 median boundary diameter
88+
dist_tx = bd_width / 4
89+
# 90th percentile density of bounding circle with radius=dist_tx
90+
k_tx = math.ceil(np.quantile(dist_tx ** 2 * np.pi * densities, 0.9))
91+
92+
print(k_tx)
93+
print(dist_tx)

0 commit comments

Comments
 (0)