1+ from segger .data .parquet .sample import STSampleParquet , STInMemoryDataset
2+ from path import Path
3+ from segger .data .utils import calculate_gene_celltype_abundance_embedding
4+ import scanpy as sc
5+ import pandas as pd
6+ import math
7+ import numpy as np
8+ from segger .data .parquet ._utils import get_polygons_from_xy
9+
"""
Preprocess MERSCOPE spatial transcriptomics data for the SEGGER cell segmentation model.

Key steps:
1. Data loading:
   - Imports MERSCOPE transcripts and nucleus boundaries.
   - (Currently disabled) Loads scRNA-seq reference data to create gene-celltype
     embeddings. The corresponding code is commented out, so no embedding
     weights are passed to the sample.

2. Neighborhood parameters:
   - The values used in this script are fixed literals set directly in the save
     call; nothing is calculated from tissue characteristics here.
   - dist_tx: transcript neighbor search radius (intended heuristic: 1/4 of the
     typical nucleus size).
   - k_tx: number of transcript neighbors to sample (intended to reflect local
     transcript density).

3. Dataset creation:
   - Filters transcripts to those overlapping nuclei.
   - Creates graph connections between nearby transcripts.
   - Splits data into training/validation sets.
   - Saves in PyG format for SEGGER training.

Usage:
- Input: raw MERSCOPE data (transcripts.parquet, nucleus_boundaries.parquet)
- Output: processed dataset with graph structure and embeddings
"""
33+
# Define data paths
# Alternative dataset configuration (disabled), kept for reference:
# MERSCOPE_DATA_DIR = Path('/omics/odcf/analysis/OE0606_projects_temp/MERSCOPE_projects/20241209_MERSCOPE5k_CNSL_BrM/20241209_MERSCOPE5k_CNSL_BrM/output-XETG00078__0041719__Region_1__20241203__142052')
# SEGGER_DATA_DIR = Path('data_tidy/pyg_datasets/CNSL_5k')
# # SCRNASEQ_FILE = Path('/omics/groups/OE0606/internal/tangy/tasks/schier/data/atals_filtered.h5ad')
# CELLTYPE_COLUMN = 'celltype_minor'
39+
40+
# Input/output locations. Both are relative paths, so they resolve against the
# directory the script is launched from.
MERSCOPE_DATA_DIR = Path('data_raw/merscope/processed/')  # read as base_dir of the sample
SEGGER_DATA_DIR = Path('data_tidy/pyg_datasets/merscope_liver')  # where the processed dataset is written
# Optional scRNA-seq reference (disabled). To enable it, uncomment the two
# settings and the embedding computation below, then pass the result as
# `weights=` when constructing the sample.
# SCRNASEQ_FILE = Path('/omics/groups/OE0606/internal/mimmo/MERSCOPE/notebooks/data/scData/bh/bh_mng_scdata_20250306.h5ad')
# CELLTYPE_COLUMN = 'annot_v1'

# Calculate gene-celltype embeddings from reference data
# gene_celltype_abundance_embedding = calculate_gene_celltype_abundance_embedding(
#     sc.read(SCRNASEQ_FILE),
#     CELLTYPE_COLUMN
# )
51+
# Initialize the spatial transcriptomics sample object from the MERSCOPE
# directory. No gene-celltype embedding is supplied: the `weights` argument is
# left commented out, matching the disabled reference-embedding code.
sample = STSampleParquet(
    base_dir=MERSCOPE_DATA_DIR,
    n_workers=4,  # presumably the number of parallel workers -- confirm in STSampleParquet
    sample_type="merscope",
    buffer_ratio=1,  # NOTE(review): meaning not visible in this file -- confirm in STSampleParquet
    # weights=gene_celltype_abundance_embedding,
)
60+
61+ # Load and filter data
62+
63+
# Save the processed dataset for SEGGER under SEGGER_DATA_DIR.
# Every value below is a fixed literal chosen by hand; this script does not
# derive any of them from the data.
# Parameters:
#   - k_bd / dist_bd: control nucleus boundary point connections
#   - k_tx / dist_tx: control transcript neighborhood connections
#   - tile_width / tile_height: size of spatial tiles for processing
#   - neg_sampling_ratio: ratio of negative to positive samples
#   - frac: fraction of the data to use
#   - val_prob / test_prob: fraction of data for validation / test
sample.save_debug(
    data_dir=SEGGER_DATA_DIR,
    k_bd=3,  # number of boundary points to connect
    dist_bd=15,  # maximum distance for boundary connections
    k_tx=5,  # fixed number of transcript neighbors (not computed here)
    dist_tx=20,  # fixed transcript search radius (not computed here)
    tile_width=500,  # tile size for processing
    tile_height=500,
    neg_sampling_ratio=5.0,  # 5:1 negative:positive samples
    frac=1.0,  # use all data
    val_prob=0.3,  # 30% validation set
    test_prob=0,  # no test set
)
0 commit comments