|
7 | 7 | import numpy as np |
8 | 8 | from segger.data.parquet._utils import get_polygons_from_xy |
9 | 9 |
|
# Input: raw Xenium output directory. Output: where the segger dataset is written.
xenium_data_dir = Path("data_raw/breast_cancer/Xenium_FFPE_Human_Breast_Cancer_Rep1/outs/")
segger_data_dir = Path("data_tidy/pyg_datasets/bc_rep1_emb")

# scRNA-seq reference and the obs column holding its cell-type labels.
scrnaseq_file = Path("/omics/groups/OE0606/internal/tangy/tasks/schier/data/atals_filtered.h5ad")
celltype_column = "celltype_major"

# Embedding computed from the reference; it is passed as `weights` when the
# samples are constructed below.
gene_celltype_abundance_embedding = calculate_gene_celltype_abundance_embedding(
    sc.read(scrnaseq_file),
    celltype_column,
)
20 | 17 |
|
# Sample handle over the Rep1 Xenium output, using 4 workers.
sample = STSampleParquet(
    base_dir=xenium_data_dir,
    n_workers=4,
    sample_type="xenium",
    # Remove this argument when no gene-celltype embedding has been computed.
    weights=gene_celltype_abundance_embedding,
)
27 | 24 |
|
# Read only the transcripts flagged as overlapping a nucleus, plus the
# nucleus boundary vertices.
nuclear_only = [[("overlaps_nucleus", "=", 1)]]
transcripts = pd.read_parquet(xenium_data_dir / "transcripts.parquet", filters=nuclear_only)
boundaries = pd.read_parquet(xenium_data_dir / "nucleus_boundaries.parquet")

# Transcript count per cell_id, and polygons built from the boundary vertices.
sizes = transcripts.groupby("cell_id").size()
polygons = get_polygons_from_xy(boundaries, "vertex_x", "vertex_y", "cell_id")

# NOTE(review): this is polygon area divided by transcript count, i.e. area
# per transcript rather than transcripts per unit area. Confirm that this is
# the intended quantity for the k_tx estimate below.
densities = polygons[sizes.index].area / sizes
bd_width = 2 * polygons.minimum_bounding_radius().median()

# dist_tx: one quarter of the median boundary diameter.
dist_tx = bd_width / 4

# k_tx: 90th percentile of (area of a circle with radius dist_tx) * densities,
# rounded up to an integer.
circle_area = np.pi * dist_tx**2
k_tx = math.ceil(np.quantile(circle_area * densities, 0.9))

print(k_tx)
print(dist_tx)
46 | 40 |
|
47 | 41 |
|
# Write the tiled dataset for the current sample to `segger_data_dir`.
#
# `k_tx` is the integer neighbour count estimated above (math.ceil of a
# quantile) and `dist_tx` is the float distance (a quarter of the median
# boundary diameter). The two keyword arguments were previously swapped, so
# the distance was used as the neighbour count and vice versa.
sample.save(
    data_dir=segger_data_dir,
    k_bd=3,
    dist_bd=15.0,
    k_tx=k_tx,
    dist_tx=dist_tx,
    tile_width=120,
    tile_height=120,
    neg_sampling_ratio=5.0,
    frac=1.0,
    val_prob=0.1,
    test_prob=0.1,
)
61 | 55 |
|
62 | 56 |
|
# Switch to the second dataset (bc_5k): rebind the input and output directories.
xenium_data_dir = Path("data_tidy/bc_5k")
segger_data_dir = Path("data_tidy/pyg_datasets/bc_5k_emb")

# Sample handle over the bc_5k data, using a single worker.
sample = STSampleParquet(
    base_dir=xenium_data_dir,
    n_workers=1,
    sample_type="xenium",
    # Remove this argument when no gene-celltype embedding has been computed.
    weights=gene_celltype_abundance_embedding,
)
74 | 67 |
|
75 | 68 |
|
# Read only the transcripts flagged as overlapping a nucleus, plus the
# nucleus boundary vertices, for the bc_5k dataset.
nuclear_only = [[("overlaps_nucleus", "=", 1)]]
transcripts = pd.read_parquet(xenium_data_dir / "transcripts.parquet", filters=nuclear_only)
boundaries = pd.read_parquet(xenium_data_dir / "nucleus_boundaries.parquet")

# Transcript count per cell_id, and polygons built from the boundary vertices.
sizes = transcripts.groupby("cell_id").size()
polygons = get_polygons_from_xy(boundaries, "vertex_x", "vertex_y", "cell_id")

# NOTE(review): this is polygon area divided by transcript count, i.e. area
# per transcript rather than transcripts per unit area. Confirm that this is
# the intended quantity for the k_tx estimate below.
densities = polygons[sizes.index].area / sizes
bd_width = 2 * polygons.minimum_bounding_radius().median()

# dist_tx: one quarter of the median boundary diameter.
dist_tx = bd_width / 4

# k_tx: 90th percentile of (area of a circle with radius dist_tx) * densities,
# rounded up to an integer.
circle_area = np.pi * dist_tx**2
k_tx = math.ceil(np.quantile(circle_area * densities, 0.9))

print(k_tx)
print(dist_tx)
0 commit comments