Clean up imports, remove reg param from Pf2, remove doublets when reading in mouse data

nbedanova · nbedanova · commit 35b15d825128 · 2025-06-30T17:55:08.000-07:00
diff --git a/pf2rnaseq/factorization.py b/pf2rnaseq/factorization.py
@@ -37,17 +37,11 @@ def pf2(
     random_state=1,
     doEmbedding: bool = True,
     tolerance=1e-9,
-    regParam=0.0,
     r2x=False,
 ):
     cupy.cuda.Device(1).use()
     pf_out, R2X = parafac2_nd(
-        X,
-        rank=rank,
-        random_state=random_state,
-        tol=tolerance,
-        n_iter_max=500,
-        l2=regParam,
+        X, rank=rank, random_state=random_state, tol=tolerance, n_iter_max=500
     )
 
     X = store_pf2(X, pf_out)
diff --git a/pf2rnaseq/imports.py b/pf2rnaseq/imports.py
@@ -1,15 +1,53 @@
-import glob
 from concurrent.futures import ProcessPoolExecutor
-from pathlib import Path
 
 import anndata
 import numpy as np
 import pandas as pd
 import scanpy as sc
-from scipy.sparse import csr_matrix, spmatrix
+from scipy.sparse import csr_array, spmatrix
+from sklearn.preprocessing import scale
 from sklearn.utils.sparsefuncs import inplace_column_scale, mean_variance_axis
 
 
+def prepare_dataset_deviance(
+    X: anndata.AnnData, condition_name, geneThreshold
+) -> anndata.AnnData:
+    """Filter X, then replace counts with scaled binomial deviance residuals."""
+    X.X = csr_array(X.X)  # type: ignore
+    assert np.amin(X.X.data) >= 0.0
+    # Keep only cells and genes with more than 10 total reads
+    X = X[X.X.sum(axis=1) > 10, X.X.sum(axis=0) > 10]
+    readmean, _ = mean_variance_axis(X.X, axis=0)  # type: ignore
+    X = X[:, readmean > geneThreshold]
+    # Copy so that the subsetting is preserved
+    X._init_as_actual(X.copy())
+    # deviance transform (works on a dense copy of the filtered counts)
+    y_ij = X.X.toarray()  # type: ignore
+    # counts per cell
+    n_i = y_ij.sum(axis=1)
+    # MLE of gene expression
+    pi_j = y_ij.sum(axis=0) / np.sum(n_i)
+    non_y_ij = n_i[:, None] - y_ij
+    mu_ij = n_i[:, None] * pi_j[None, :]
+    signs = np.sign(y_ij - mu_ij)
+    # Clamp log numerators at 1: a zero count then adds 0 instead of 0 * log(0)
+    first_term = 2 * y_ij * np.log(np.maximum(y_ij, 1.0) / mu_ij)
+    second_term = 2 * non_y_ij * np.log(
+        np.maximum(non_y_ij, 1.0) / (n_i[:, None] - mu_ij)
+    )
+    X.X = signs * np.sqrt(np.maximum(first_term + second_term, 0.0))
+    # Center and unit-scale each gene (column)
+    X.X = scale(X.X)
+
+    _, sgIndex = np.unique(X.obs_vector(condition_name), return_inverse=True)
+    X.obs["condition_unique_idxs"] = sgIndex
+    X.obs["condition_unique_idxs"] = X.obs["condition_unique_idxs"].astype("category")
+    # Gene means are stored as zeros because scale() already centered each gene
+    X.var["means"] = np.zeros(X.shape[1])
+    assert np.all(np.isfinite(X.X))  # type: ignore
+    return X
+
+
 def prepare_dataset(
     X: anndata.AnnData, condition_name: str, geneThreshold: float
 ) -> anndata.AnnData:
@@ -65,58 +103,6 @@ def import_citeseq() -> anndata.AnnData:
     return prepare_dataset(X, "Condition", geneThreshold=0.1)
 
 
-def import_HTAN() -> anndata.AnnData:
-    """Imports Vanderbilt's HTAN 10X data."""
-    files = glob.glob("/opt/extra-storage/HTAN/*.mtx.gz")
-    futures = []
-    data = {}
-
-    with ProcessPoolExecutor(max_workers=10) as executor:
-        for filename in files:
-            future = executor.submit(
-                sc.read_10x_mtx,
-                "/opt/extra-storage/HTAN/",
-                gex_only=False,
-                make_unique=True,
-                prefix=filename.split("/")[-1].split("matrix.")[0],
-            )
-            futures.append(future)
-
-        for i, k in enumerate(files):
-            result = futures[i].result()
-            data[k.split("/")[-1].split("_matrix.")[0]] = result
-
-    X = anndata.concat(data, merge="same", label="Condition")
-
-    return prepare_dataset(X, "Condition", geneThreshold=0.1)
-
-
-def import_CCLE() -> anndata.AnnData:
-    """Imports barcoded cell data."""
-    # TODO: Still need to add gene names and barcodes.
-    folder = "/opt/extra-storage/asm/Heiser-barcode/CCLE/"
-
-    adatas = {
-        "HCT116_1": anndata.read_text(
-            Path(folder + "HCT116_tracing_T1.count_mtx.tsv")
-        ).T,
-        "HCT116_2": anndata.read_text(
-            Path(folder + "HCT116_tracing_T2.count_mtx.tsv")
-        ).T,
-        "MDA-MB-231_1": anndata.read_text(
-            Path(folder + "MDA-MB-231_tracing_T1.count_mtx.tsv")
-        ).T,
-        "MDA-MB-231_2": anndata.read_text(
-            Path(folder + "MDA-MB-231_tracing_T2.count_mtx.tsv")
-        ).T,
-    }
-
-    X = anndata.concat(adatas, label="sample")
-    X.X = csr_matrix(X.X)
-
-    return prepare_dataset(X, "sample", geneThreshold=0.1)
-
-
 def import_cytokine() -> anndata.AnnData:
     """Import Meyer Cytokine PBMC dataset.
     -- columns from observation data:
@@ -140,25 +126,39 @@ def import_pf2Cytokine30() -> anndata.AnnData:
     return X
 
 
-def import_Heiser() -> anndata.AnnData:
+def import_Heiser(deviance=False) -> anndata.AnnData:
     """Import Heiser C3TAg dataset.
     anndata.X is the raw counts
 
     """
     data = anndata.read_h5ad("/home/nicoleb/C3TAg.h5ad")
+    if deviance:
+        # Deviance-residual preprocessing; the result is returned as-is below
+        data = prepare_dataset_deviance(data, "sample_id", geneThreshold=0.1)
+    else:
+        # Standard preprocessing; the result is returned as-is below
+        data = prepare_dataset(data, "sample_id", geneThreshold=0.1)
 
-    return prepare_dataset(data, "sample_id", geneThreshold=0.01)
+    return data
 
 
 def import_MouseImmune() -> anndata.AnnData:
-    """Import cytokine data including gene expression and hashtag information.
-    Processes files with naming patterns like:
-    - GSM6102842_cytokine-samples07-barcodes.tsv.gz
-    - GSM6102885_cytokine-hashtags06-matrix.mtx.gz
-    """
+    """Import Mouse Immune Dictionary cytokine data.
+    Cells whose 'celltype' is 'doublet' are dropped before prepare_dataset runs.
+    -- columns from observation data:
+    {'biosample_id': cytokine and replicate info,
+    'rep': replicate, 'species': mouse species,
+    'cytokine_family': cytokine family label,
+    'cyt': cytokine mouse was treated with,
+    'sex': sex of mouse,
+    'celltype': cell type label,
+    'organ__ontology_label': organ label,
+    ...}"""
     X = anndata.read_h5ad("/home/nicoleb/MouseCytok.h5ad")
+    # Filter out cells whose celltype annotation is "doublet"
+    X = X[X.obs["celltype"] != "doublet", :]
 
-    return prepare_dataset(X, "cyt", geneThreshold=0.1)  # 0.01
+    return prepare_dataset(X, "biosample_id", geneThreshold=0.1)  # 0.01
 
 
 def pseudobulk_lupus(X, cellType="Cell Type"):