Merge pull request #24 from openproblems-bio/add_resolvi

LouisK92 · web-flow · commit 56ed6b778d33 · 2025-03-18T15:50:39.000+01:00
Add resolvi
diff --git a/src/methods_expression_correction/resolvi_correction/config.vsh.yaml b/src/methods_expression_correction/resolvi_correction/config.vsh.yaml
@@ -0,0 +1,57 @@
+__merge__: /src/api/comp_method_expression_correction.yaml
+
+name: resolvi_correction
+label: "resolVI Correction"
+summary: "Corrects the expression of genes using resolVI"
+description: >-
+  Corrects the expression of genes based on the resolVI method, a part of scvi-tools.
+links:
+  documentation: "https://docs.scvi-tools.org/en/latest/user_guide/models/resolvi.html"
+  repository: "https://github.com/scverse/scvi-tools"
+references:
+  doi: "10.1101/2025.01.20.634005"
+
+arguments:
+  - name: --celltype_key
+    required: false
+    direction: input
+    type: string
+    default: cell_type
+
+  - name: --n_hidden
+    required: false
+    direction: input
+    type: integer
+    default: 32
+
+  - name: --encode_covariates
+    required: false
+    direction: input
+    type: boolean
+    default: false
+
+  - name: --downsample_counts
+    required: false
+    direction: input
+    type: boolean
+    default: true
+
+resources:
+  - type: python_script
+    path: script.py
+
+engines:
+  - type: docker
+    image: openproblems/base_python:1.0.0
+    __merge__: 
+      - /src/base/setup_txsim_partial.yaml
+    setup:
+        - type: python
+          pypi: [scvi-tools]
+  - type: native
+
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [ midtime, highcpu, highmem ]
diff --git a/src/methods_expression_correction/resolvi_correction/script.py b/src/methods_expression_correction/resolvi_correction/script.py
@@ -0,0 +1,82 @@
+import anndata as ad
+import txsim as tx
+import scvi
+import pandas as pd
+import scanpy as sc
+import scipy
+import numpy as np
+
+## VIASH START
+# Note: this section is auto-generated by viash at runtime. To edit it, make changes
+# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`.
+par = {
+  'input_spatial_with_cell_types': 'resources_test/task_ist_preprocessing/mouse_brain_combined/spatial_with_cell_types.h5ad',  # read with ad.read_h5ad
+  'celltype_key': 'cell_type',  # passed as labels_key to RESOLVI.setup_anndata
+  'output': '../resolvi_spatial_corrected.h5ad',  # path handed to adata_sp.write
+  'n_hidden': 32,  # forwarded to the RESOLVI constructor
+  'encode_covariates': False,  # forwarded to the RESOLVI constructor
+  'downsample_counts': True  # forwarded to the RESOLVI constructor
+}
+meta = {
+  'name': 'resolvi_correction',  # not read anywhere else in this script
+}
+## VIASH END
+
+# NOTE/TODO: for grid search:
+# - n_hidden: 32 (default), 64, 128
+# - encode_covariates: False(default)/True
+# - downsample_counts: True(default)/False
+
+# No optional-input check is needed: par['input_spatial_with_cell_types'] is the only input file read.
+
+# Read input
+print('Reading input files', flush=True)
+adata_sp = ad.read_h5ad(par['input_spatial_with_cell_types'])
+adata_sp.layers["normalized_uncorrected"] = adata_sp.layers["normalized"]
+
+# NOTE(review): the message now matches the min_genes filter below; confirm min_counts was not intended.
+print("Filter cells with <5 detected genes", flush=True)
+sc.pp.filter_cells(adata_sp, min_genes=5)
+# Cell centroids as an (n_cells, 2) array in obsm['X_spatial'] (presumably read by RESOLVI; confirm).
+adata_sp.obsm['X_spatial'] = np.stack([adata_sp.obs['centroid_x'].values, adata_sp.obs['centroid_y'].values], axis=1)
+
+# Register the "counts" layer and the cell-type labels, then fit a semi-supervised resolVI model
+print('Running ResolVI', flush=True)
+
+scvi.external.RESOLVI.setup_anndata(adata_sp, layer="counts", labels_key=par['celltype_key'])
+
+# Hyperparameters exposed in config.vsh.yaml are forwarded unchanged
+resolvi_kwargs = {key: par[key] for key in ('n_hidden', 'encode_covariates', 'downsample_counts')}
+supervised_resolvi = scvi.external.RESOLVI(adata_sp, semisupervised=True, **resolvi_kwargs)
+# Fixed training budget of 50 epochs
+supervised_resolvi.train(max_epochs=50)
+
+# Median over 20 posterior draws of the px_rate site of model_corrected
+sampling_kwargs = dict(num_samples=20, return_samples=False, batch_size=4000)  # batch_steps is not a parameter
+samples_corr = pd.DataFrame(supervised_resolvi.sample_posterior(
+    model=supervised_resolvi.module.model_corrected,
+    return_sites=['px_rate'],
+    summary_fun={"post_sample_q50": np.median},
+    **sampling_kwargs)).T
+
+# NOTE(review): no summary_fun is passed here; the 'post_sample_means' row read later presumably comes from the library default - confirm
+residual_sites = ['mixture_proportions', 'mean_poisson', 'per_gene_background',
+                  'diffusion_mixture_proportion', 'per_neighbor_diffusion', 'px_r_inv']
+samples = pd.DataFrame(supervised_resolvi.sample_posterior(
+    model=supervised_resolvi.module.model_residuals,
+    return_sites=residual_sites,
+    **sampling_kwargs)).T
+
+
+adata_sp.obsm["X_resolVI"] = supervised_resolvi.get_latent_representation()
+
+# TODO: the "generated_expression" layers stay unwritten: 'obs' was not generated in samples_corr.
+# Scale observed counts element-wise by px_rate / (1 + px_rate + mean_poisson).
+px_rate = samples_corr.loc['post_sample_q50', 'px_rate']
+mean_poisson = samples.loc['post_sample_means', 'mean_poisson']
+counts = scipy.sparse.csr_matrix(adata_sp.layers['counts'])  # tolerate a dense "counts" layer
+adata_sp.layers["corrected_counts"] = counts.multiply(px_rate / (1.0 + px_rate + mean_poisson)).tocsr()
+
+# Write output
+print('Writing output', flush=True)
+adata_sp.write(par['output'])