Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified .DS_Store
Binary file not shown.
4 changes: 2 additions & 2 deletions scripts/0_data_creation_5k_nucleus.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,8 @@


# subsample the scRNAseq if needed
# sc.pp.subsample(scrnaseq, 0.1)
# scrnaseq.var_names_make_unique()
sc.pp.subsample(scrnaseq, 0.1)
scrnaseq.var_names_make_unique()


# Calculate gene-celltype embeddings from reference data
Expand Down
11 changes: 6 additions & 5 deletions scripts/1_train_5k.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,17 +17,17 @@



segger_data_dir = Path("data_tidy/pyg_datasets/human_CRC_seg_cells")
models_dir = Path("./models/human_CRC_seg_cells")
segger_data_dir = Path("data_tidy/pyg_datasets/MNG_5k_sampled/output-XETG00078__0041719__Region_2__20241203__142052/")
models_dir = Path("./models/MNG_5k_sampled/output-XETG00078__0041719__Region_2__20241203__142052/")

# Base directory to store Pytorch Lightning models
# models_dir = Path('models')

# Initialize the Lightning data module
dm = SeggerDataModule(
data_dir=segger_data_dir,
batch_size=2,
num_workers=2,
batch_size=3,
num_workers=3,
)

dm.setup()
Expand All @@ -43,6 +43,7 @@


model = Segger(
# is_token_based=is_token_based,s
num_tx_tokens= num_tx_tokens,
init_emb=8,
hidden_channels=32,
Expand All @@ -64,7 +65,7 @@
strategy="auto",
precision="32",
devices=4, # set higher number if more gpus are available
max_epochs=150,
max_epochs=250,
default_root_dir=models_dir,
logger=CSVLogger(models_dir),
)
Expand Down
41 changes: 41 additions & 0 deletions scripts/4_xenium_explorer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@


# Export a segger segmentation to Xenium Explorer files.
#
# Reads the segger transcript assignments from a parquet file and calls
# seg2explorer() with the raw Xenium run directory and an output directory.
from pathlib import Path  # required by the Path(...) settings at the bottom

import pandas as pd
from segger.validation.xenium_explorer import seg2explorer


# Segger output (transcripts with their assigned segger cell ids).
transcripts_file = 'data_tidy/benchmarks/human_CRC_seg_nuclei/human_CRC_seg_nuclei_0.4_False_4_15_5_3_20250521/segger_transcripts.parquet'
transcripts_df = pd.read_parquet(transcripts_file)
# transcripts_df = transcripts_df.iloc[:10000]  # keep only the first 10,000 rows
seg2explorer(
    seg_df=transcripts_df,  # this is your segger output
    source_path="/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_raw/xenium_seg_kit/human_CRC_real",  # raw xenium data
    output_dir="data_tidy/explorer/human_CRC_real_nuclei",  # where to save the xenium explorer files; could be the same as the raw data dir
    cells_filename="seg_cells1",  # file name for cells.zarr
    analysis_filename="seg_analysis1",  # file name for analysis.zarr
    xenium_filename="seg_experiment1.xenium",  # xenium explorer file
    analysis_df=None,
    cell_id_columns="segger_cell_id",  # segger cell id column in transcripts_df
)


# NOTE(review): everything below is assigned after the export has already run
# and nothing in this file reads it (transcripts_file is even rebound here).
# It looks like configuration left over from another script - confirm whether
# it can be dropped.
XENIUM_DATA_DIR = Path(  # raw data dir
    "/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_raw/xenium_seg_kit/human_CRC_real"
)
transcripts_file = (
    XENIUM_DATA_DIR / "transcripts.parquet"
)

SEGGER_DATA_DIR = Path("data_tidy/pyg_datasets/human_CRC_seg_nuclei")  # preprocessed data dir


seg_tag = "human_CRC_seg_nuclei"
model_version = 0
models_dir = Path("./models") / seg_tag  # trained model dir


output_dir = Path(  # output dir
    "/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_tidy/benchmarks/human_CRC_seg_nuclei"
)

Binary file added scripts/batch_run_xenium.zip
Binary file not shown.
84 changes: 84 additions & 0 deletions scripts/batch_run_xenium/create_data_batch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#!/usr/bin/env python
from pathlib import Path
from segger.data.parquet.sample import STSampleParquet
from segger.data.utils import calculate_gene_celltype_abundance_embedding
import scanpy as sc
import pandas as pd
import numpy as np
import argparse
import os
from pqdm.processes import pqdm
from tqdm import tqdm
from segger.data.parquet._utils import find_markers, find_mutually_exclusive_genes

def main(argv=None):
    """Preprocess one Xenium sample into a segger dataset.

    Parses the command line, loads and subsamples the scRNA-seq reference,
    calls ``calculate_gene_celltype_abundance_embedding`` on it, builds an
    ``STSampleParquet`` for ``<project_dir>/<sample_id>`` and saves the result
    to ``<output_dir>/<sample_id>``.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``, which is the behavior when
            the file is run as a script.

    Raises:
        FileNotFoundError: If ``<project_dir>/<sample_id>`` is not a
            directory. Checked before the reference data is loaded.
        Exception: Any error raised while building or saving the sample is
            reported on stderr and re-raised so the batch job fails visibly.
    """
    import sys  # local import: only needed for error reporting below

    # Set up argument parser
    parser = argparse.ArgumentParser(description='Preprocess Xenium sample for segger')
    parser.add_argument('--sample_id', type=str, required=True, help='Xenium sample ID to process')
    parser.add_argument('--project_dir', type=str, required=True, help='Base directory containing Xenium samples')
    parser.add_argument('--scrna_file', type=str, required=True, help='Path to scRNA-seq reference file')
    parser.add_argument('--output_dir', type=str, required=True, help='Output directory for processed data')
    parser.add_argument('--celltype_column', type=str, default="Annotation_merged", help='Column name for cell types in scRNA-seq data')
    parser.add_argument('--n_workers', type=int, default=4, help='Number of workers for processing')
    parser.add_argument('--k_tx', type=int, default=5, help='Number of neighbors for transcript graph')
    parser.add_argument('--dist_tx', type=float, default=20.0, help='Distance threshold for transcript graph')
    parser.add_argument('--subsample_frac', type=float, default=0.1, help='Subsampling fraction for scRNA-seq data')

    args = parser.parse_args(argv)

    # Convert paths to Path objects
    project_dir = Path(args.project_dir)
    scrnaseq_file = Path(args.scrna_file)
    output_dir = Path(args.output_dir)

    # Per-sample input and output locations
    xenium_data_dir = project_dir / args.sample_id
    segger_data_dir = output_dir / args.sample_id

    # Fail fast: a wrong sample id should not cost a full reference load.
    if not xenium_data_dir.is_dir():
        raise FileNotFoundError(
            f"Xenium sample directory not found: {xenium_data_dir}"
        )

    # Load reference data and compute embeddings
    scrnaseq = sc.read(scrnaseq_file)
    sc.pp.subsample(scrnaseq, args.subsample_frac)
    gene_celltype_abundance_embedding = calculate_gene_celltype_abundance_embedding(
        scrnaseq,
        args.celltype_column
    )

    # Process the sample
    try:
        sample = STSampleParquet(
            base_dir=xenium_data_dir,
            n_workers=args.n_workers,
            sample_type="xenium",  # xenium for typical xenium, xenium_v2 for v2
            weights=gene_celltype_abundance_embedding,
            # scale_factor=0.5 # this is to shrink the initial seg. masks (used for seg. kit)
        )

        # Genes present in both the reference and the sample; sorted so the
        # column order of the subset below is the same on every run.
        genes = sorted(set(scrnaseq.var_names) & set(sample.transcripts_metadata['feature_names']))
        markers = find_markers(scrnaseq[:, genes], cell_type_column=args.celltype_column, pos_percentile=90, neg_percentile=20, percentage=20)
        # Find mutually exclusive genes based on scRNAseq data
        # NOTE(review): the result is not passed to sample.save() below -
        # confirm whether it should be, or whether this call can be removed.
        exclusive_gene_pairs = find_mutually_exclusive_genes(
            adata=scrnaseq,
            markers=markers,
            cell_type_column=args.celltype_column
        )

        sample.save(
            data_dir=segger_data_dir,
            k_bd=3,
            dist_bd=15,
            k_tx=args.k_tx,
            dist_tx=args.dist_tx,
            k_tx_ex=20,
            dist_tx_ex=20,
            tile_size=10_000,  # Tile size for processing
            neg_sampling_ratio=5.0,
            frac=1.0,
            val_prob=0.3,
            test_prob=0.0,
        )
        print(f"Successfully processed {args.sample_id}")
    except Exception as e:
        # Report on stderr so the message lands in the job's error log,
        # then re-raise to keep the non-zero exit status.
        print(f"Failed to process {args.sample_id}: {e}", file=sys.stderr)
        raise


if __name__ == "__main__":
    main()
43 changes: 43 additions & 0 deletions scripts/batch_run_xenium/create_data_batch.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/bin/bash
#
# Submit one LSF (bsub) preprocessing job per Xenium sample directory.
# Usage: create_data_batch.sh [PROJECT_DIR] [SCRNASEQ_FILE] [OUTPUT_DIR] [SEGGER_DATA_DIR] [CELLTYPE_COLUMN]

# Configuration - can be modified or overridden with positional command line arguments
PROJECT_DIR=${1:-"/omics/odcf/analysis/OE0606_projects/oncolgy_data_exchange/domenico_temp/xenium/xenium_output_files"} # $1: folder including the raw xenium data folders
SCRNASEQ_FILE=${2:-"/omics/groups/OE0606/internal/tangy/tasks/brain_met_data/merged.Annotation_merged.h5ad"} # $2: the scRNAseq atlas
CELLTYPE_COLUMN=${5:-"Annotation_merged"} # $5: column pointing to the cell type annotation
OUTPUT_DIR=${3:-"logs"} # $3: where to save the logs
SEGGER_DATA_DIR=${4:-"data_tidy/pyg_datasets/project24_MNG_final"} # $4: where to save intermediate segger files (graphs and embeddings)

# create_data_batch.py lives next to this script; resolve it to an absolute
# path so the job does not depend on the submission working directory.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)

# Create output directories if they don't exist
mkdir -p "$OUTPUT_DIR"
mkdir -p "$SEGGER_DATA_DIR"

# Get list of samples to process (only XETG sample directories).
# A glob is used instead of parsing `ls`; nullglob makes "no match" an empty list.
shopt -s nullglob
SAMPLE_PATHS=("$PROJECT_DIR"/output-XETG*/)

if [ "${#SAMPLE_PATHS[@]}" -eq 0 ]; then
    echo "No output-XETG* sample directories found in ${PROJECT_DIR}" >&2
fi

# Submit jobs for each sample
for SAMPLE_PATH in "${SAMPLE_PATHS[@]}"; do
    SAMPLE=$(basename "$SAMPLE_PATH")
    echo "Submitting job for sample: $SAMPLE"

    # Create output directory for the sample
    mkdir -p "${SEGGER_DATA_DIR}/${SAMPLE}"

    # Submit with bsub
    bsub -o "${OUTPUT_DIR}/preprocess_${SAMPLE}.log" \
        -e "${OUTPUT_DIR}/preprocess_${SAMPLE}.err" \
        -R "rusage[mem=32GB]" \
        -n 5 \
        -q medium \
        python "${SCRIPT_DIR}/create_data_batch.py" \
        --sample_id "$SAMPLE" \
        --project_dir "$PROJECT_DIR" \
        --scrna_file "$SCRNASEQ_FILE" \
        --output_dir "$SEGGER_DATA_DIR" \
        --celltype_column "$CELLTYPE_COLUMN" \
        --n_workers 5 \
        --k_tx 5 \
        --dist_tx 5.0 \
        --subsample_frac 0.1
done

echo "All preprocessing jobs submitted"
43 changes: 43 additions & 0 deletions scripts/batch_run_xenium/create_data_batch_BrM.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/bin/bash
#
# Submit one LSF (bsub) preprocessing job per Xenium sample directory (BrM defaults).
# Usage: create_data_batch_BrM.sh [PROJECT_DIR] [SCRNASEQ_FILE] [OUTPUT_DIR] [SEGGER_DATA_DIR] [CELLTYPE_COLUMN]

# Configuration - can be modified or overridden with positional command line arguments
PROJECT_DIR=${1:-"/omics/odcf/analysis/OE0606_projects/xenium_projects/temp/20250212_Xenium5k/20250212__133739__051_B450_Run01_2025-02-12/"} # $1: folder including the raw xenium data folders
SCRNASEQ_FILE=${2:-"/omics/groups/OE0606/internal/amathiou/Projects/202403_BrainMets/snRNAseq/merged.Annotation_merged.h5ad"} # $2: the scRNAseq atlas
CELLTYPE_COLUMN=${5:-"Annotation_merged"} # $5: column pointing to the cell type annotation
OUTPUT_DIR=${3:-"logs"} # $3: where to save the logs
SEGGER_DATA_DIR=${4:-"data_tidy/pyg_datasets/BrM"} # $4: where to save intermediate segger files (graphs and embeddings)


# Create output directories if they don't exist
mkdir -p "$OUTPUT_DIR"
mkdir -p "$SEGGER_DATA_DIR"

# Get list of samples to process (only XETG sample directories).
# A glob is used instead of parsing `ls`; nullglob makes "no match" an empty list.
shopt -s nullglob
SAMPLE_PATHS=("$PROJECT_DIR"/output-XETG*/)

if [ "${#SAMPLE_PATHS[@]}" -eq 0 ]; then
    echo "No output-XETG* sample directories found in ${PROJECT_DIR}" >&2
fi

# Submit jobs for each sample
for SAMPLE_PATH in "${SAMPLE_PATHS[@]}"; do
    SAMPLE=$(basename "$SAMPLE_PATH")
    echo "Submitting job for sample: $SAMPLE"

    # Create output directory for the sample
    mkdir -p "${SEGGER_DATA_DIR}/${SAMPLE}"

    # Submit with bsub (the script path is relative to the submission directory)
    bsub -o "${OUTPUT_DIR}/preprocess_${SAMPLE}.log" \
        -e "${OUTPUT_DIR}/preprocess_${SAMPLE}.err" \
        -R "rusage[mem=300GB]" \
        -n 5 \
        -q highmem-debian \
        python ../segger_dev/scripts/batch_run_xenium/create_data_batch.py \
        --sample_id "$SAMPLE" \
        --project_dir "$PROJECT_DIR" \
        --scrna_file "$SCRNASEQ_FILE" \
        --output_dir "$SEGGER_DATA_DIR" \
        --celltype_column "$CELLTYPE_COLUMN" \
        --n_workers 5 \
        --k_tx 5 \
        --dist_tx 5.0 \
        --subsample_frac 0.1
done

echo "All preprocessing jobs submitted"
43 changes: 43 additions & 0 deletions scripts/batch_run_xenium/create_data_batch_EwS.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/bin/bash
#
# Submit one LSF (bsub) preprocessing job per Xenium sample directory (EwS defaults).
# Usage: create_data_batch_EwS.sh [PROJECT_DIR] [SCRNASEQ_FILE] [OUTPUT_DIR] [SEGGER_DATA_DIR] [CELLTYPE_COLUMN]

# Configuration - can be modified or overridden with positional command line arguments
PROJECT_DIR=${1:-"/omics/odcf/analysis/OE0606_projects/xenium_projects/2024_09_Ewing_Scarcoma_project_HL/tempfolder_migrated_20250519/raw/xenium/20250206__094540__029_B450_Run05_2025-02-06"} # $1: folder including the raw xenium data folders
SCRNASEQ_FILE=${2:-"/omics/groups/OE0606/internal/hluo/data/EwingSarcoma/internal/EwS_kitz_sarc_4EwS_merged_preprocessed_no_normalization_with_annotation_from_seurat.h5ad"} # $2: the scRNAseq atlas
CELLTYPE_COLUMN=${5:-"cell_type"} # $5: column pointing to the cell type annotation
OUTPUT_DIR=${3:-"logs"} # $3: where to save the logs
SEGGER_DATA_DIR=${4:-"data_tidy/pyg_datasets/EwS"} # $4: where to save intermediate segger files (graphs and embeddings)


# Create output directories if they don't exist
mkdir -p "$OUTPUT_DIR"
mkdir -p "$SEGGER_DATA_DIR"

# Get list of samples to process (only XETG sample directories).
# A glob is used instead of parsing `ls`; nullglob makes "no match" an empty list.
shopt -s nullglob
SAMPLE_PATHS=("$PROJECT_DIR"/output-XETG*/)

if [ "${#SAMPLE_PATHS[@]}" -eq 0 ]; then
    echo "No output-XETG* sample directories found in ${PROJECT_DIR}" >&2
fi

# Submit jobs for each sample
for SAMPLE_PATH in "${SAMPLE_PATHS[@]}"; do
    SAMPLE=$(basename "$SAMPLE_PATH")
    echo "Submitting job for sample: $SAMPLE"

    # Create output directory for the sample
    mkdir -p "${SEGGER_DATA_DIR}/${SAMPLE}"

    # Submit with bsub (the script path is relative to the submission directory)
    bsub -o "${OUTPUT_DIR}/preprocess_${SAMPLE}.log" \
        -e "${OUTPUT_DIR}/preprocess_${SAMPLE}.err" \
        -R "rusage[mem=300GB]" \
        -n 5 \
        -q highmem-debian \
        python ../segger_dev/scripts/batch_run_xenium/create_data_batch.py \
        --sample_id "$SAMPLE" \
        --project_dir "$PROJECT_DIR" \
        --scrna_file "$SCRNASEQ_FILE" \
        --output_dir "$SEGGER_DATA_DIR" \
        --celltype_column "$CELLTYPE_COLUMN" \
        --n_workers 5 \
        --k_tx 5 \
        --dist_tx 5.0 \
        --subsample_frac 0.1
done

echo "All preprocessing jobs submitted"
43 changes: 43 additions & 0 deletions scripts/batch_run_xenium/create_data_batch_GB.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/bin/bash
#
# Submit one LSF (bsub) preprocessing job per Xenium sample directory (GB defaults).
# Usage: create_data_batch_GB.sh [PROJECT_DIR] [SCRNASEQ_FILE] [OUTPUT_DIR] [SEGGER_DATA_DIR] [CELLTYPE_COLUMN]

# Configuration - can be modified or overridden with positional command line arguments
PROJECT_DIR=${1:-"/omics/odcf/analysis/OE0606_projects/oncolgy_data_exchange/20230907-GB-Xenium-Vis-sn/Raw/Neuronal_Panel/20240822_GB_CytAssist_Run03"} # $1: folder including the raw xenium data folders
SCRNASEQ_FILE=${2:-"/omics/odcf/analysis/OE0606_projects/oncolgy_data_exchange/20230907-GB-Xenium-Vis-sn/sn/GBmap_core.h5ad"} # $2: the scRNAseq atlas
CELLTYPE_COLUMN=${5:-"cell_type"} # $5: column pointing to the cell type annotation
OUTPUT_DIR=${3:-"logs"} # $3: where to save the logs
SEGGER_DATA_DIR=${4:-"data_tidy/pyg_datasets/Neuronal_Panel"} # $4: where to save intermediate segger files (graphs and embeddings)


# Create output directories if they don't exist
mkdir -p "$OUTPUT_DIR"
mkdir -p "$SEGGER_DATA_DIR"

# Get list of samples to process (only XETG sample directories).
# A glob is used instead of parsing `ls`; nullglob makes "no match" an empty list.
shopt -s nullglob
SAMPLE_PATHS=("$PROJECT_DIR"/output-XETG*/)

if [ "${#SAMPLE_PATHS[@]}" -eq 0 ]; then
    echo "No output-XETG* sample directories found in ${PROJECT_DIR}" >&2
fi

# Submit jobs for each sample
for SAMPLE_PATH in "${SAMPLE_PATHS[@]}"; do
    SAMPLE=$(basename "$SAMPLE_PATH")
    echo "Submitting job for sample: $SAMPLE"

    # Create output directory for the sample
    mkdir -p "${SEGGER_DATA_DIR}/${SAMPLE}"

    # Submit with bsub (the script path is relative to the submission directory)
    bsub -o "${OUTPUT_DIR}/preprocess_${SAMPLE}.log" \
        -e "${OUTPUT_DIR}/preprocess_${SAMPLE}.err" \
        -R "rusage[mem=300GB]" \
        -n 5 \
        -q highmem-debian \
        python ../segger_dev/scripts/batch_run_xenium/create_data_batch.py \
        --sample_id "$SAMPLE" \
        --project_dir "$PROJECT_DIR" \
        --scrna_file "$SCRNASEQ_FILE" \
        --output_dir "$SEGGER_DATA_DIR" \
        --celltype_column "$CELLTYPE_COLUMN" \
        --n_workers 5 \
        --k_tx 5 \
        --dist_tx 5.0 \
        --subsample_frac 0.1
done

echo "All preprocessing jobs submitted"
Loading