Fix transcript filtering by substrings: anchor each filter pattern to the start of the feature name (reported in #89)

EliHei2 · EliHei2 · commit 37253a1d5578 · 2025-04-30T10:58:20.000+02:00
diff --git a/.gitignore b/.gitignore
@@ -176,4 +176,7 @@ dev*
 
 # Custom
 *_old*
-.dev
+.dev
+
+scripts/*
+.scripts/*
diff --git a/docs/notebooks/segger_tutorial.ipynb b/docs/notebooks/segger_tutorial.ipynb
@@ -358,6 +358,29 @@
     "- **`--precision`**: Enables mixed precision training (e.g., `16-mixed`), which can speed up training and reduce memory usage while maintaining accuracy."
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cfff5dca",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Evaluate results\n",
+    "model_version = 0  # 'v_num' from training output above\n",
+    "model_path = Path('../human_CRC') / 'lightning_logs' / f'version_{model_version}'\n",
+    "metrics = pd.read_csv(model_path / 'metrics.csv', index_col=1)\n",
+    "\n",
+    "fig, ax = plt.subplots(1,1, figsize=(2,2))\n",
+    "\n",
+    "for col in metrics.columns.difference(['epoch']):\n",
+    "    metric = metrics[col].dropna()\n",
+    "    ax.plot(metric.index, metric.values, label=col)\n",
+    "\n",
+    "ax.legend(loc=(1, 0.33))\n",
+    "ax.set_ylim(0, 1)\n",
+    "ax.set_xlabel('Step')"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "9a7d20c6-ca16-4beb-b627-afb41e3fb491",
@@ -461,6 +484,14 @@
     "Below is an example of how to run the faster Segger prediction pipeline using the command line:"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cdda303d",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
   {
    "cell_type": "code",
    "execution_count": null,
diff --git a/scripts/create_data_cosmx.py b/scripts/create_data_cosmx.py
@@ -39,7 +39,7 @@
 
 
 XENIUM_DATA_DIR = Path("data_raw/cosmx/human_pancreas/processed/")
-SEGGER_DATA_DIR = Path("data_tidy/pyg_datasets/cosmx_pancreas_50")
+SEGGER_DATA_DIR = Path("data_tidy/pyg_datasets/cosmx_pancreas_fixed_")
 # SCRNASEQ_FILE = Path('/omics/groups/OE0606/internal/mimmo/Xenium/notebooks/data/scData/bh/bh_mng_scdata_20250306.h5ad')
 # CELLTYPE_COLUMN = 'annot_v1'
 
@@ -92,11 +92,11 @@
     data_dir=SEGGER_DATA_DIR,
     k_bd=3,  # Number of boundary points to connect
     dist_bd=15,  # Maximum distance for boundary connections
-    k_tx=20,  # Use calculated optimal transcript neighbors
+    k_tx=5,  # Use calculated optimal transcript neighbors
     dist_tx=70,  # Use calculated optimal search radius
-    tile_width=500,  # Tile size for processing
-    tile_height=500,
-    neg_sampling_ratio=5.0,  # 5:1 negative:positive samples
+    tile_width=1000,  # Tile size for processing
+    tile_height=1000,  # Tile size for processing
+    neg_sampling_ratio=10.0,  # 10:1 negative:positive samples
     frac=1.0,  # Use all data
     val_prob=0.3,  # 30% validation set
     test_prob=0,  # No test set
diff --git a/scripts/create_data_fast_sample.py b/scripts/create_data_fast_sample.py
@@ -39,13 +39,13 @@
 
 
 XENIUM_DATA_DIR = Path(
-    "/omics/odcf/analysis/OE0606_projects_temp/oncolgy_data_exchange/analysis_domenico/project_24/output-XETG00423__0053177__mng_04_TMA__20250306__170821"
+    "/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_raw/xenium_seg_kit/human_CRC"
 )
-SEGGER_DATA_DIR = Path("data_tidy/pyg_datasets/MNG_0053177")
-SCRNASEQ_FILE = Path(
-    "/omics/groups/OE0606/internal/mimmo/Xenium/notebooks/data/scData/bh/bh_mng_scdata_20250306.h5ad"
-)
-CELLTYPE_COLUMN = "annot_v1"
+SEGGER_DATA_DIR = Path("data_tidy/pyg_datasets/human_CRC_full")
+# SCRNASEQ_FILE = Path(
+#     "/omics/groups/OE0606/internal/mimmo/Xenium/notebooks/data/scData/bh/bh_mng_scdata_20250306.h5ad"
+# )
+# CELLTYPE_COLUMN = "annot_v1"
 
 # Calculate gene-celltype embeddings from reference data
 # gene_celltype_abundance_embedding = calculate_gene_celltype_abundance_embedding(
@@ -94,10 +94,10 @@
     data_dir=SEGGER_DATA_DIR,
     k_bd=3,  # Number of boundary points to connect
     dist_bd=15,  # Maximum distance for boundary connections
-    k_tx=k_tx,  # Use calculated optimal transcript neighbors
-    dist_tx=dist_tx,  # Use calculated optimal search radius
-    tile_width=100,  # Tile size for processing
-    tile_height=100,
+    k_tx=5,  # Number of transcript neighbors (hard-coded; no longer the calculated k_tx)
+    dist_tx=5,  # Transcript search radius (hard-coded; no longer the calculated dist_tx)
+    tile_width=200,  # Tile size for processing
+    tile_height=200,
     neg_sampling_ratio=5.0,  # 5:1 negative:positive samples
     frac=1.0,  # Use all data
     val_prob=0.3,  # 30% validation set
diff --git a/scripts/predict_model_sample.py b/scripts/predict_model_sample.py
@@ -10,25 +10,33 @@
 from pathlib import Path
 
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+os.environ["CUPY_CACHE_DIR"] = "./.cupy"
 import cupy as cp
 from dask.distributed import Client, LocalCluster
 from dask_cuda import LocalCUDACluster
 import dask.dataframe as dd
 
 
-seg_tag = "bc_rep1_emb_final"
-model_version = 6
+seg_tag = "human_CRC"
+model_version = 0
 
-seg_tag = "bc_fast_data_emb_major"
-model_version = 1
+seg_tag = "human_CRC"
+model_version = 0
+
+
+
+XENIUM_DATA_DIR = Path(
+    "/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_raw/xenium_seg_kit/human_CRC"
+)
+SEGGER_DATA_DIR = Path("data_tidy/pyg_datasets/human_CRC_full")
 
 segger_data_dir = Path("data_tidy/pyg_datasets") / seg_tag
 models_dir = Path("./models") / seg_tag
 benchmarks_dir = Path(
-    "/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_tidy/benchmarks/xe_rep1_bc"
+    "/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_tidy/benchmarks/human_CRC"
 )
 transcripts_file = (
-    "data_raw/xenium/Xenium_FFPE_Human_Breast_Cancer_Rep1/transcripts.parquet"
+   "/dkfz/cluster/gpu/data/OE0606/elihei/segger_experiments/data_raw/xenium_seg_kit/human_CRC/transcripts.parquet"
 )
 # Initialize the Lightning data module
 dm = SeggerDataModule(
diff --git a/scripts/train_model_sample.py b/scripts/train_model_sample.py
@@ -1,5 +1,3 @@
-from segger.data.io import XeniumSample
-from segger.training.train import LitSegger
 from segger.training.segger_data_module import SeggerDataModule
 # from segger.prediction.predict import predict, load_model
 from segger.models.segger_model import Segger
@@ -19,8 +17,8 @@
 
 
 
-segger_data_dir = segger_data_dir = Path("data_tidy/pyg_datasets/cosmx_pancreas")
-models_dir = Path("./models/cosmx_pancreas")
+segger_data_dir = segger_data_dir = Path("data_tidy/pyg_datasets/human_CRC_full")
+models_dir = Path("./models/human_CRC")
 
 # Base directory to store Pytorch Lightning models
 # models_dir = Path('models')
@@ -46,7 +44,7 @@
 
 model = Segger(
     # is_token_based=is_token_based,
-    num_tx_tokens= 25000,
+    num_tx_tokens= 850,
     init_emb=8,
     hidden_channels=64,
     out_channels=16,
@@ -73,14 +71,14 @@
 
 # Initialize the Lightning trainer
 trainer = Trainer(
-    accelerator="cpu",
+    accelerator="gpu",
     strategy="auto",
     precision="16-mixed",
-    devices=2,  # set higher number if more gpus are available
-    max_epochs=400,
+    devices=4,  # set higher number if more gpus are available
+    max_epochs=100,
     default_root_dir=models_dir,
     logger=CSVLogger(models_dir),
 )
 
 
-trainer.fit(ls , datamodule=dm)
+trainer.fit(ls, datamodule=dm)
diff --git a/src/segger/data/parquet/_settings/xenium.yaml b/src/segger/data/parquet/_settings/xenium.yaml
@@ -4,7 +4,7 @@ transcripts:
   y: "y_location"
   z: "z_location"
   id: "transcript_id"
-  label: "target"
+  label: "feature_name"
   nuclear_column: "overlaps_nucleus"
   nuclear_value: 1
   qv_column: "qv"
diff --git a/src/segger/data/parquet/sample.py b/src/segger/data/parquet/sample.py
@@ -14,9 +14,11 @@
 from itertools import compress
 from torch_geometric.data import HeteroData
 from torch_geometric.transforms import RandomLinkSplit
+from pqdm.threads import pqdm
 import torch
 import random
 from segger.data.parquet.transcript_embedding import TranscriptEmbedding
+# import re
 
 
 # TODO: Add documentation for settings
@@ -203,7 +205,8 @@ def transcripts_metadata(self) -> dict:
                 missing_genes = list(set(names_str) - set(self._emb_genes))
                 logging.warning(f"Number of missing genes: {len(missing_genes)}")
                 self.settings.transcripts.filter_substrings.extend(missing_genes)
-            pattern = "|".join(self.settings.transcripts.filter_substrings)
+            # Anchor each filter entry with "^" so it only matches at the start of a name (#89).
+            pattern = "|".join(f"^{s}" for s in self.settings.transcripts.filter_substrings)
             mask = pc.invert(pc.match_substring_regex(names, pattern))
             filtered_names = pc.filter(names, mask).to_pylist()
             metadata["feature_names"] = [
@@ -674,6 +677,7 @@ def _load_transcripts(self, path: os.PathLike, min_qv: float = 30.0):
         transcripts[self.settings.transcripts.label] = transcripts[
             self.settings.transcripts.label
         ].apply(lambda x: x.decode("utf-8") if isinstance(x, bytes) else x)
+        qv_column = getattr(self.settings.transcripts, "qv_column", None)
         transcripts = utils.filter_transcripts(
             transcripts,
             self.settings.transcripts.label,