EliHei2 · EliHei2 · Jul 24, 2025 · Aug 18, 2025 · Sep 29, 2025
diff --git a/.DS_Store b/.DS_Store
diff --git a/scripts/0_data_creation_5k_nucleus.py b/scripts/0_data_creation_5k_nucleus.py
@@ -49,8 +49,8 @@
 
 
 # subsample the scRNAseq if needed
-# sc.pp.subsample(scrnaseq, 0.1)
-# scrnaseq.var_names_make_unique()
+sc.pp.subsample(scrnaseq, 0.1)
+scrnaseq.var_names_make_unique()
 
 
 # Calculate gene-celltype embeddings from reference data

diff --git a/scripts/1_train_5k.py b/scripts/1_train_5k.py
@@ -17,17 +17,17 @@
 
 
 
-segger_data_dir = Path("data_tidy/pyg_datasets/human_CRC_seg_cells")
-models_dir = Path("./models/human_CRC_seg_cells")
+segger_data_dir = Path("data_tidy/pyg_datasets/MNG_5k_sampled/output-XETG00078__0041719__Region_2__20241203__142052/")
+models_dir = Path("./models/MNG_5k_sampled/output-XETG00078__0041719__Region_2__20241203__142052/")
 
 # Base directory to store Pytorch Lightning models
 # models_dir = Path('models')
 
 # Initialize the Lightning data module
 dm = SeggerDataModule(
     data_dir=segger_data_dir,
-    batch_size=2,
-    num_workers=2,
+    batch_size=3,
+    num_workers=3,
 )
 
 dm.setup()
@@ -43,6 +43,7 @@
 
 
 model = Segger(
+    # is_token_based=is_token_based,
     num_tx_tokens= num_tx_tokens,
     init_emb=8,
     hidden_channels=32,
@@ -64,7 +65,7 @@
     strategy="auto",
     precision="32",
     devices=4,  # set higher number if more gpus are available
-    max_epochs=150,
+    max_epochs=250,
     default_root_dir=models_dir,
     logger=CSVLogger(models_dir),
 )

diff --git a/scripts/4_xenium_explorer.py b/scripts/4_xenium_explorer.py
@@ -0,0 +1,41 @@
+"""Export segger transcript assignments as a Xenium Explorer experiment."""
+from pathlib import Path
+
+import pandas as pd
+from segger.validation.xenium_explorer import seg2explorer
+
+transcripts_file = 'data_tidy/benchmarks/human_CRC_seg_nuclei/human_CRC_seg_nuclei_0.4_False_4_15_5_3_20250521/segger_transcripts.parquet'
+transcripts_df = pd.read_parquet(transcripts_file)
+# transcripts_df = transcripts_df.iloc[:10000]
+seg2explorer(
+    seg_df=transcripts_df, # this is your segger output
+    source_path="/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_raw/xenium_seg_kit/human_CRC_real", # raw xenium data
+    output_dir="data_tidy/explorer/human_CRC_real_nuclei", # where to save the Xenium Explorer files; could be the same as the raw data dir
+    cells_filename="seg_cells1", # file name for cells.zarr
+    analysis_filename="seg_analysis1", # file name for analysis.zarr
+    xenium_filename="seg_experiment1.xenium", # xenium explorer file
+    analysis_df=None,
+    cell_id_columns="segger_cell_id", # segger cell id column in transcripts_df
+)
+
+# NOTE(review): the settings below are not used by the export call above - confirm whether they are still needed.
+
+XENIUM_DATA_DIR = Path( #raw data dir
+    "/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_raw/xenium_seg_kit/human_CRC_real"
+)
+transcripts_file = (
+   XENIUM_DATA_DIR / "transcripts.parquet"
+)
+
+SEGGER_DATA_DIR = Path("data_tidy/pyg_datasets/human_CRC_seg_nuclei") # preprocessed data dir
+
+
+seg_tag = "human_CRC_seg_nuclei"
+model_version = 0
+models_dir = Path("./models") / seg_tag #trained model dir
+
+
+output_dir = Path( #output dir
+    "/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_tidy/benchmarks/human_CRC_seg_nuclei"
+)
+
diff --git a/scripts/batch_run_xenium.zip b/scripts/batch_run_xenium.zip
diff --git a/scripts/batch_run_xenium/create_data_batch.py b/scripts/batch_run_xenium/create_data_batch.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python
+from pathlib import Path
+from segger.data.parquet.sample import STSampleParquet
+from segger.data.utils import calculate_gene_celltype_abundance_embedding
+import scanpy as sc
+import pandas as pd
+import numpy as np
+import argparse
+import os
+from pqdm.processes import pqdm
+from tqdm import tqdm
+from segger.data.parquet._utils import find_markers, find_mutually_exclusive_genes
+
+def main():
+    """Preprocess one Xenium sample into a segger dataset (CLI entry point).
+
+    Parses the command-line options, subsamples the scRNA-seq reference, calls
+    ``calculate_gene_celltype_abundance_embedding`` on it, and saves the sample
+    under ``<output_dir>/<sample_id>``. Failures while building or saving the
+    sample are printed and re-raised.
+    """
+    cli = argparse.ArgumentParser(description='Preprocess Xenium sample for segger')
+    # (flag, type, default, help); a default of ``None`` marks a required option.
+    option_specs = [
+        ('--sample_id', str, None, 'Xenium sample ID to process'),
+        ('--project_dir', str, None, 'Base directory containing Xenium samples'),
+        ('--scrna_file', str, None, 'Path to scRNA-seq reference file'),
+        ('--output_dir', str, None, 'Output directory for processed data'),
+        ('--celltype_column', str, "Annotation_merged", 'Column name for cell types in scRNA-seq data'),
+        ('--n_workers', int, 4, 'Number of workers for processing'),
+        ('--k_tx', int, 5, 'Number of neighbors for transcript graph'),
+        ('--dist_tx', float, 20.0, 'Distance threshold for transcript graph'),
+        ('--subsample_frac', float, 0.1, 'Subsampling fraction for scRNA-seq data'),
+    ]
+    for flag, kind, default, help_text in option_specs:
+        requirement = {'required': True} if default is None else {'default': default}
+        cli.add_argument(flag, type=kind, help=help_text, **requirement)
+    opts = cli.parse_args()
+    celltype_col = opts.celltype_column
+
+    # Subsample the reference before deriving the embeddings used as weights.
+    reference = sc.read(Path(opts.scrna_file))
+    sc.pp.subsample(reference, opts.subsample_frac)
+    embedding = calculate_gene_celltype_abundance_embedding(reference, celltype_col)
+
+    # Raw input and processed output each live in a per-sample subdirectory.
+    raw_dir = Path(opts.project_dir) / opts.sample_id
+    processed_dir = Path(opts.output_dir) / opts.sample_id
+
+    try:
+        sample = STSampleParquet(
+            base_dir=raw_dir,
+            n_workers=opts.n_workers,
+            sample_type="xenium", # xenium for typical xenium, xenium_v2 for v2
+            weights=embedding,
+            # scale_factor=0.5 # this is to shrink the initial seg. masks (used for seg. kit)
+        )
+        # Marker detection runs on genes present in both the reference and the sample metadata.
+        panel_genes = set(sample.transcripts_metadata['feature_names'])
+        genes = list(set(reference.var_names) & panel_genes)
+        markers = find_markers(
+            reference[:, genes],
+            cell_type_column=celltype_col,
+            pos_percentile=90,
+            neg_percentile=20,
+            percentage=20,
+        )
+        # NOTE(review): the pairs below are computed but never passed on to save().
+        exclusive_gene_pairs = find_mutually_exclusive_genes(
+            adata=reference, markers=markers, cell_type_column=celltype_col
+        )
+
+        save_options = dict(
+            k_bd=3, dist_bd=15, k_tx=opts.k_tx, dist_tx=opts.dist_tx,
+            k_tx_ex=20, dist_tx_ex=20, tile_size=10_000,
+            neg_sampling_ratio=5.0, frac=1.0, val_prob=0.3, test_prob=0.0,
+        )
+        sample.save(data_dir=processed_dir, **save_options)
+        print(f"Successfully processed {opts.sample_id}")
+    except Exception as err:
+        print(f"Failed to process {opts.sample_id}: {str(err)}")
+        raise
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/batch_run_xenium/create_data_batch.sh b/scripts/batch_run_xenium/create_data_batch.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+# Usage: create_data_batch.sh [PROJECT_DIR] [SCRNASEQ_FILE] [OUTPUT_DIR] [SEGGER_DATA_DIR] [CELLTYPE_COLUMN]
+# Configuration - each default below can be overridden with the positional argument listed above
+PROJECT_DIR=${1:-"/omics/odcf/analysis/OE0606_projects/oncolgy_data_exchange/domenico_temp/xenium/xenium_output_files"} # folder including the raw xenium data folders
+SCRNASEQ_FILE=${2:-"/omics/groups/OE0606/internal/tangy/tasks/brain_met_data/merged.Annotation_merged.h5ad"} # the scRNAseq atlas
+CELLTYPE_COLUMN=${5:-"Annotation_merged"} # column pointing to the cell type annotation
+OUTPUT_DIR=${3:-"logs"} # where to save the logs
+SEGGER_DATA_DIR=${4:-"data_tidy/pyg_datasets/project24_MNG_final"} # where to save intermediate segger files (graphs and embeddings)
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" # create_data_batch.py is expected next to this script
+# Create the log and dataset directories if they don't exist
+mkdir -p "$OUTPUT_DIR"
+mkdir -p "$SEGGER_DATA_DIR"
+
+# Get list of samples to process (only XETG samples)
+SAMPLES=($(ls "$PROJECT_DIR" | grep "^output-XETG"))
+
+# Submit jobs for each sample
+for SAMPLE in "${SAMPLES[@]}"; do
+    echo "Submitting job for sample: $SAMPLE"
+
+    # Create output directory for the sample
+    mkdir -p "${SEGGER_DATA_DIR}/${SAMPLE}"
+
+    # Submit with bsub (quoted paths so directories containing spaces are not split)
+    bsub -o "${OUTPUT_DIR}/preprocess_${SAMPLE}.log" \
+         -e "${OUTPUT_DIR}/preprocess_${SAMPLE}.err" \
+         -R "rusage[mem=32GB]" \
+         -n 5 \
+         -q medium \
+         python "${SCRIPT_DIR}/create_data_batch.py" \
+            --sample_id "$SAMPLE" \
+            --project_dir "$PROJECT_DIR" \
+            --scrna_file "$SCRNASEQ_FILE" \
+            --output_dir "$SEGGER_DATA_DIR" \
+            --celltype_column "$CELLTYPE_COLUMN" \
+            --n_workers 5 \
+            --k_tx 5 \
+            --dist_tx 5.0 \
+            --subsample_frac 0.1
+done
+
+echo "All preprocessing jobs submitted"
diff --git a/scripts/batch_run_xenium/create_data_batch_BrM.sh b/scripts/batch_run_xenium/create_data_batch_BrM.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+# Usage: create_data_batch_BrM.sh [PROJECT_DIR] [SCRNASEQ_FILE] [OUTPUT_DIR] [SEGGER_DATA_DIR] [CELLTYPE_COLUMN]
+# Configuration - each default below can be overridden with the positional argument listed above
+PROJECT_DIR=${1:-"/omics/odcf/analysis/OE0606_projects/xenium_projects/temp/20250212_Xenium5k/20250212__133739__051_B450_Run01_2025-02-12/"} # folder including the raw xenium data folders
+SCRNASEQ_FILE=${2:-"/omics/groups/OE0606/internal/amathiou/Projects/202403_BrainMets/snRNAseq/merged.Annotation_merged.h5ad"} # the scRNAseq atlas
+CELLTYPE_COLUMN=${5:-"Annotation_merged"} # column pointing to the cell type annotation
+OUTPUT_DIR=${3:-"logs"} # where to save the logs
+SEGGER_DATA_DIR=${4:-"data_tidy/pyg_datasets/BrM"} # where to save intermediate segger files (graphs and embeddings)
+
+
+# Create the log and dataset directories if they don't exist
+mkdir -p "$OUTPUT_DIR"
+mkdir -p "$SEGGER_DATA_DIR"
+
+# Get list of samples to process (only XETG samples)
+SAMPLES=($(ls "$PROJECT_DIR" | grep "^output-XETG"))
+
+# Submit jobs for each sample
+for SAMPLE in "${SAMPLES[@]}"; do
+    echo "Submitting job for sample: $SAMPLE"
+
+    # Create output directory for the sample
+    mkdir -p "${SEGGER_DATA_DIR}/${SAMPLE}"
+
+    # Submit with bsub (quoted paths so directories containing spaces are not split)
+    bsub -o "${OUTPUT_DIR}/preprocess_${SAMPLE}.log" \
+         -e "${OUTPUT_DIR}/preprocess_${SAMPLE}.err" \
+         -R "rusage[mem=300GB]" \
+         -n 5 \
+         -q highmem-debian \
+         python ../segger_dev/scripts/batch_run_xenium/create_data_batch.py \
+            --sample_id "$SAMPLE" \
+            --project_dir "$PROJECT_DIR" \
+            --scrna_file "$SCRNASEQ_FILE" \
+            --output_dir "$SEGGER_DATA_DIR" \
+            --celltype_column "$CELLTYPE_COLUMN" \
+            --n_workers 5 \
+            --k_tx 5 \
+            --dist_tx 5.0 \
+            --subsample_frac 0.1
+done
+
+echo "All preprocessing jobs submitted"
diff --git a/scripts/batch_run_xenium/create_data_batch_EwS.sh b/scripts/batch_run_xenium/create_data_batch_EwS.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+# Usage: create_data_batch_EwS.sh [PROJECT_DIR] [SCRNASEQ_FILE] [OUTPUT_DIR] [SEGGER_DATA_DIR] [CELLTYPE_COLUMN]
+# Configuration - each default below can be overridden with the positional argument listed above
+PROJECT_DIR=${1:-"/omics/odcf/analysis/OE0606_projects/xenium_projects/2024_09_Ewing_Scarcoma_project_HL/tempfolder_migrated_20250519/raw/xenium/20250206__094540__029_B450_Run05_2025-02-06"} # folder including the raw xenium data folders
+SCRNASEQ_FILE=${2:-"/omics/groups/OE0606/internal/hluo/data/EwingSarcoma/internal/EwS_kitz_sarc_4EwS_merged_preprocessed_no_normalization_with_annotation_from_seurat.h5ad"} # the scRNAseq atlas
+CELLTYPE_COLUMN=${5:-"cell_type"} # column pointing to the cell type annotation
+OUTPUT_DIR=${3:-"logs"} # where to save the logs
+SEGGER_DATA_DIR=${4:-"data_tidy/pyg_datasets/EwS"} # where to save intermediate segger files (graphs and embeddings)
+
+
+# Create the log and dataset directories if they don't exist
+mkdir -p "$OUTPUT_DIR"
+mkdir -p "$SEGGER_DATA_DIR"
+
+# Get list of samples to process (only XETG samples)
+SAMPLES=($(ls "$PROJECT_DIR" | grep "^output-XETG"))
+
+# Submit jobs for each sample
+for SAMPLE in "${SAMPLES[@]}"; do
+    echo "Submitting job for sample: $SAMPLE"
+
+    # Create output directory for the sample
+    mkdir -p "${SEGGER_DATA_DIR}/${SAMPLE}"
+
+    # Submit with bsub (quoted paths so directories containing spaces are not split)
+    bsub -o "${OUTPUT_DIR}/preprocess_${SAMPLE}.log" \
+         -e "${OUTPUT_DIR}/preprocess_${SAMPLE}.err" \
+         -R "rusage[mem=300GB]" \
+         -n 5 \
+         -q highmem-debian \
+         python ../segger_dev/scripts/batch_run_xenium/create_data_batch.py \
+            --sample_id "$SAMPLE" \
+            --project_dir "$PROJECT_DIR" \
+            --scrna_file "$SCRNASEQ_FILE" \
+            --output_dir "$SEGGER_DATA_DIR" \
+            --celltype_column "$CELLTYPE_COLUMN" \
+            --n_workers 5 \
+            --k_tx 5 \
+            --dist_tx 5.0 \
+            --subsample_frac 0.1
+done
+
+echo "All preprocessing jobs submitted"
diff --git a/scripts/batch_run_xenium/create_data_batch_GB.sh b/scripts/batch_run_xenium/create_data_batch_GB.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+# Usage: create_data_batch_GB.sh [PROJECT_DIR] [SCRNASEQ_FILE] [OUTPUT_DIR] [SEGGER_DATA_DIR] [CELLTYPE_COLUMN]
+# Configuration - each default below can be overridden with the positional argument listed above
+PROJECT_DIR=${1:-"/omics/odcf/analysis/OE0606_projects/oncolgy_data_exchange/20230907-GB-Xenium-Vis-sn/Raw/Neuronal_Panel/20240822_GB_CytAssist_Run03"} # folder including the raw xenium data folders
+SCRNASEQ_FILE=${2:-"/omics/odcf/analysis/OE0606_projects/oncolgy_data_exchange/20230907-GB-Xenium-Vis-sn/sn/GBmap_core.h5ad"} # the scRNAseq atlas
+CELLTYPE_COLUMN=${5:-"cell_type"} # column pointing to the cell type annotation
+OUTPUT_DIR=${3:-"logs"} # where to save the logs
+SEGGER_DATA_DIR=${4:-"data_tidy/pyg_datasets/Neuronal_Panel"} # where to save intermediate segger files (graphs and embeddings)
+
+
+# Create the log and dataset directories if they don't exist
+mkdir -p "$OUTPUT_DIR"
+mkdir -p "$SEGGER_DATA_DIR"
+
+# Get list of samples to process (only XETG samples)
+SAMPLES=($(ls "$PROJECT_DIR" | grep "^output-XETG"))
+
+# Submit jobs for each sample
+for SAMPLE in "${SAMPLES[@]}"; do
+    echo "Submitting job for sample: $SAMPLE"
+
+    # Create output directory for the sample
+    mkdir -p "${SEGGER_DATA_DIR}/${SAMPLE}"
+
+    # Submit with bsub (quoted paths so directories containing spaces are not split)
+    bsub -o "${OUTPUT_DIR}/preprocess_${SAMPLE}.log" \
+         -e "${OUTPUT_DIR}/preprocess_${SAMPLE}.err" \
+         -R "rusage[mem=300GB]" \
+         -n 5 \
+         -q highmem-debian \
+         python ../segger_dev/scripts/batch_run_xenium/create_data_batch.py \
+            --sample_id "$SAMPLE" \
+            --project_dir "$PROJECT_DIR" \
+            --scrna_file "$SCRNASEQ_FILE" \
+            --output_dir "$SEGGER_DATA_DIR" \
+            --celltype_column "$CELLTYPE_COLUMN" \
+            --n_workers 5 \
+            --k_tx 5 \
+            --dist_tx 5.0 \
+            --subsample_frac 0.1
+done
+
+echo "All preprocessing jobs submitted"