22from path import Path
33from segger .data .utils import calculate_gene_celltype_abundance_embedding
44import scanpy as sc
5+ import pandas as pd
6+ import math
7+ import numpy as np
8+ from segger .data .parquet ._utils import get_polygons_from_xy
59
# --- Input / output locations -------------------------------------------
# Raw Xenium output directory of the breast-cancer Rep1 sample.
xenium_data_dir = Path('data_raw/breast_cancer/Xenium_FFPE_Human_Breast_Cancer_Rep1/outs/')
# Destination passed to `sample.save(data_dir=...)` further down.
segger_data_dir = Path('data_tidy/pyg_datasets/bc_rep1_emb')

# scRNA-seq reference that is read with scanpy and fed into
# `calculate_gene_celltype_abundance_embedding`.
# NOTE(review): 'atals' may be a misspelling of 'atlas' -- confirm the file
# name on disk before changing it; it is kept as written here.
scrnaseq_file = Path('/omics/groups/OE0606/internal/tangy/tasks/schier/data/atals_filtered.h5ad')
# Name of the cell-type annotation column (its consumer is outside this excerpt).
celltype_column = 'celltype_major'
1216gene_celltype_abundance_embedding = calculate_gene_celltype_abundance_embedding (
1317 sc .read (scrnaseq_file ),
2125 weights = gene_celltype_abundance_embedding , # uncomment if gene-celltype embeddings are available
2226)
2327
# --- Derive transcript-graph parameters from the nuclei -------------------
# Load only the transcripts flagged as overlapping a nucleus, plus the
# nucleus boundary vertices.
transcripts = pd.read_parquet(
    xenium_data_dir / 'transcripts.parquet',
    filters=[[('overlaps_nucleus', '=', 1)]],
)
boundaries = pd.read_parquet(
    xenium_data_dir / 'nucleus_boundaries.parquet',
)

# Number of nuclear transcripts per cell and one polygon per nucleus.
sizes = transcripts.groupby('cell_id').size()
polygons = get_polygons_from_xy(boundaries, 'vertex_x', 'vertex_y', 'cell_id')

# NOTE(review): this is polygon area divided by transcript count, i.e. area
# per transcript rather than transcripts per area. If `k_tx` below is meant
# to be an expected transcript count inside the circle, the ratio may need
# to be inverted -- confirm before relying on the value.
densities = polygons[sizes.index].area / sizes

# Median nucleus "diameter": twice the median minimum bounding radius.
median_radius = polygons.minimum_bounding_radius().median()
bd_width = median_radius * 2

# Search radius for transcripts: a quarter of the median boundary diameter.
dist_tx = bd_width / 4

# 90th percentile of (area of a circle with radius dist_tx) * densities,
# rounded up to an integer.
circle_area = dist_tx ** 2 * np.pi
k_tx = math.ceil(np.quantile(circle_area * densities, 0.9))

print(k_tx)
print(dist_tx)
46+
47+
# Save the sample to `segger_data_dir` using the graph parameters estimated
# above.
#
# `k_tx` is the integer produced by `math.ceil(...)` and `dist_tx` is the
# float radius (a quarter of the median boundary diameter). They were
# previously passed crosswise (`k_tx=dist_tx`, `dist_tx=k_tx`); each value now
# goes to the keyword of the same name, matching the roles of the former
# hard-coded values (`k_tx=20`, `dist_tx=3`).
sample.save(
    data_dir=segger_data_dir,
    k_bd=3,
    dist_bd=15.0,
    k_tx=k_tx,
    dist_tx=dist_tx,
    tile_width=120,
    tile_height=120,
    neg_sampling_ratio=5.0,
    frac=1.0,
    val_prob=0.1,
    test_prob=0.1,
)
61+
62+
# --- Second dataset: bc_5k -------------------------------------------------
# Re-point the input and output directories; every statement below this
# point reads these rebound names.
xenium_data_dir = Path('data_tidy/bc_5k')
segger_data_dir = Path('data_tidy/pyg_datasets/bc_5k_emb')

# Build the sample for the new directory. The gene x cell-type abundance
# embedding computed earlier in the script is passed again as `weights`.
sample = STSampleParquet(
    base_dir=xenium_data_dir,
    sample_type='xenium',
    n_workers=1,
    weights=gene_celltype_abundance_embedding,
)
74+
75+
# --- Derive transcript-graph parameters for the current sample ------------
# Same estimation as for the first dataset, run against whatever
# `xenium_data_dir` currently points to.
transcripts = pd.read_parquet(
    xenium_data_dir / 'transcripts.parquet',
    filters=[[('overlaps_nucleus', '=', 1)]],
)
boundaries = pd.read_parquet(
    xenium_data_dir / 'nucleus_boundaries.parquet',
)

# Nuclear transcript count per cell and one polygon per nucleus.
sizes = transcripts.groupby('cell_id').size()
polygons = get_polygons_from_xy(boundaries, 'vertex_x', 'vertex_y', 'cell_id')

# NOTE(review): area / count is area per transcript, not transcripts per
# area; if `k_tx` should approximate the number of transcripts inside the
# circle, this ratio may be inverted -- confirm.
densities = polygons[sizes.index].area / sizes

# Twice the median minimum bounding radius of the nuclei.
median_radius = polygons.minimum_bounding_radius().median()
bd_width = median_radius * 2

# Search radius for transcripts: a quarter of the median boundary diameter.
dist_tx = bd_width / 4

# 90th percentile of (area of a circle with radius dist_tx) * densities,
# rounded up to an integer.
circle_area = dist_tx ** 2 * np.pi
k_tx = math.ceil(np.quantile(circle_area * densities, 0.9))

print(k_tx)
print(dist_tx)
0 commit comments