|
| 1 | +import anndata as ad |
| 2 | +import numpy as np |
| 3 | +import logging |
| 4 | + |
| 5 | +logger = logging.getLogger(__name__) |
| 6 | +logging.basicConfig(level=logging.INFO) |
| 7 | + |
| 8 | +## VIASH START |
| 9 | +par = { |
| 10 | + "input": "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE279nnn/GSE279945/suppl/GSE279945_sc_counts_processed.h5ad", |
| 11 | + "var_feature_name": "index", |
| 12 | + "data_type": "sc", |
| 13 | + "donor_id": None, |
| 14 | + "cell_type": None, |
| 15 | + "perturbation": None, |
| 16 | + "dataset_id": "op3", |
| 17 | + "dataset_name": "OP3: single-cell multimodal dataset in PBMCs for perturbation prediction benchmarking", |
| 18 | + "dataset_summary": "The Open Problems Perurbation Prediction (OP3) dataset with small molecule perturbations in PBMCs", |
| 19 | + "dataset_description": "The OP3 dataset is to-date the largest single-cell small molecule perturbation dataset in primary tissue with multiple donor replicates.", |
| 20 | + "output": "output.h5ad", |
| 21 | + "output_compression": "gzip" |
| 22 | +} |
| 23 | +meta = {} |
| 24 | +## VIASH END |
| 25 | + |
| 26 | +def filter_op3_data(adata): |
| 27 | + """ |
| 28 | + Filter the OP3 dataset based on specific criteria for each small molecule and cell type. |
| 29 | + """ |
| 30 | + logger.info("Applying OP3-specific filtering criteria") |
| 31 | + |
| 32 | + # Original filter code follows... |
| 33 | + |
| 34 | + # Create a boolean mask for filtering observations |
| 35 | + obs_filt = np.ones(adata.n_obs, dtype=bool) |
| 36 | + |
| 37 | + # Alvocidib only T cells in only 2 donors, remove |
| 38 | + obs_filt = obs_filt & (adata.obs['sm_name'] != "Alvocidib") |
| 39 | + |
| 40 | + # BMS-387032 - one donor with only T cells, two other consistent, but only 2 cell types |
| 41 | + # Leave the 2 cell types in, remove donor 2 with only T cells |
| 42 | + obs_filt = obs_filt & ~((adata.obs['sm_name'] == "BMS-387032") & (adata.obs['donor_id'] == "Donor 2")) |
| 43 | + |
| 44 | + # BMS-387032 remove myeloid cells and B cells |
| 45 | + obs_filt = obs_filt & ~((adata.obs['sm_name'] == "BMS-387032") & |
| 46 | + adata.obs['cell_type'].isin(["B cells", "Myeloid cells"])) |
| 47 | + |
| 48 | + # CGP 60474 has only T cells left, remove |
| 49 | + obs_filt = obs_filt & (adata.obs['sm_name'] != "CGP 60474") |
| 50 | + |
| 51 | + # Canertinib - the variation of Myeloid cell proportions is very large, skip Myeloid |
| 52 | + obs_filt = obs_filt & ~((adata.obs['sm_name'] == "Canertinib") & |
| 53 | + (adata.obs['cell_type'] == "Myeloid cells")) |
| 54 | + |
| 55 | + # Foretinib - large variation in Myeloid cell proportions (some in T cells), skip Myeloid |
| 56 | + obs_filt = obs_filt & ~((adata.obs['sm_name'] == "Foretinib") & |
| 57 | + (adata.obs['cell_type'] == "Myeloid cells")) |
| 58 | + |
| 59 | + # Ganetespib (STA-9090) - donor 2 has no Myeloid and small NK cells proportions |
| 60 | + # Skip Myeloid, remove donor 2 |
| 61 | + obs_filt = obs_filt & ~((adata.obs['sm_name'] == "Ganetespib (STA-9090)") & |
| 62 | + (adata.obs['donor_id'] == "Donor 2")) |
| 63 | + |
| 64 | + # IN1451 - donor 2 has no NK or B, remove Donor 2 |
| 65 | + obs_filt = obs_filt & ~((adata.obs['sm_name'] == "IN1451") & |
| 66 | + (adata.obs['donor_id'] == "Donor 2")) |
| 67 | + |
| 68 | + # Navitoclax - donor 3 doesn't have B cells and has different T and Myeloid proportions |
| 69 | + # Remove donor 3 |
| 70 | + obs_filt = obs_filt & ~((adata.obs['sm_name'] == "Navitoclax") & |
| 71 | + (adata.obs['donor_id'] == "Donor 3")) |
| 72 | + |
| 73 | + # PF-04691502 remove Myeloid (only present in donor 3) |
| 74 | + obs_filt = obs_filt & ~((adata.obs['sm_name'] == "PF-04691502") & |
| 75 | + (adata.obs['cell_type'] == "Myeloid cells")) |
| 76 | + |
| 77 | + # Proscillaridin A;Proscillaridin-A remove Myeloid, since the variation is very high (4x) |
| 78 | + obs_filt = obs_filt & ~((adata.obs['sm_name'] == "Proscillaridin A;Proscillaridin-A") & |
| 79 | + (adata.obs['cell_type'] == "Myeloid cells")) |
| 80 | + |
| 81 | + # R428 - skip NK due to high variation (close to 3x) |
| 82 | + obs_filt = obs_filt & ~((adata.obs['sm_name'] == "R428") & |
| 83 | + (adata.obs['cell_type'] == "NK cells")) |
| 84 | + |
| 85 | + # UNII-BXU45ZH6LI - remove due to large variation across all cell types and missing cell types |
| 86 | + obs_filt = obs_filt & (adata.obs['sm_name'] != "UNII-BXU45ZH6LI") |
| 87 | + |
| 88 | + # Apply the filter |
| 89 | + filtered_adata = adata[obs_filt].copy() |
| 90 | + |
| 91 | + logger.info(f"Filtered data from {adata.n_obs} to {filtered_adata.n_obs} cells") |
| 92 | + |
| 93 | + return filtered_adata |
| 94 | + |
| 95 | +def fix_splits(adata): |
| 96 | + """Fix splits after reannotation.""" |
| 97 | + logger.info("Fix splits after reannotation") |
| 98 | + adata.obs["cell_type_orig_updated"] = adata.obs["cell_type_orig"].apply(lambda x: "T cells" if x.startswith("T ") else x) |
| 99 | + adata.obs["sm_cell_type_orig"] = adata.obs["sm_name"].astype(str) + "_" + adata.obs["cell_type_orig_updated"].astype(str) |
| 100 | + mapping_to_split = adata.obs.groupby("sm_cell_type_orig")["split"].apply(lambda x: x.unique()[0]).to_dict() |
| 101 | + adata.obs["sm_cell_type"] = adata.obs["sm_name"].astype(str) + "_" + adata.obs["cell_type"].astype(str) |
| 102 | + adata.obs["split"] = adata.obs["sm_cell_type"].map(mapping_to_split) |
| 103 | + adata.obs['control'] = adata.obs['split'].eq("control") |
| 104 | + return adata |
| 105 | + |
| 106 | + |
| 107 | +def move_x_to_layers(adata): |
| 108 | + """Move .X to .layers['counts'] and set X to None.""" |
| 109 | + logger.info("Moving .X to .layers['counts']") |
| 110 | + adata.layers["counts"] = adata.X.copy() |
| 111 | + adata.X = None |
| 112 | + |
| 113 | +def add_metadata_to_uns(adata, par): |
| 114 | + """Add standardized metadata to .uns.""" |
| 115 | + logger.info("Adding metadata to .uns") |
| 116 | + adata.uns['dataset_id'] = par["dataset_id"] |
| 117 | + adata.uns['dataset_name'] = par["dataset_name"] |
| 118 | + adata.uns['dataset_summary'] = par["dataset_summary"] |
| 119 | + adata.uns['dataset_description'] = par["dataset_description"] |
| 120 | + adata.uns['dataset_organism'] = 'Homo sapiens' |
| 121 | + adata.uns['dataset_url'] = par["dataset_url"] |
| 122 | + adata.uns['dataset_reference'] = 'GSE279945' |
| 123 | + adata.uns['dataset_version'] = '1.0.0' |
| 124 | + adata.uns['processing_status'] = 'processed' |
| 125 | + |
| 126 | +def print_unique(adata, column): |
| 127 | + """Print unique values in a column.""" |
| 128 | + if column in adata.obs: |
| 129 | + values = adata.obs[column].unique() |
| 130 | + if len(values) <= 10: |
| 131 | + formatted = "', '".join(values) |
| 132 | + logger.info(f"Unique {column}: ['{formatted}']") |
| 133 | + else: |
| 134 | + logger.info(f"Unique {column}: {len(values)} values") |
| 135 | + |
| 136 | +def print_summary(adata): |
| 137 | + """Print a summary of the dataset.""" |
| 138 | + logger.info(f"Dataset shape: {adata.shape}") |
| 139 | + logger.info(f"Number of cells: {adata.n_obs}") |
| 140 | + logger.info(f"Number of genes: {adata.n_vars}") |
| 141 | + |
| 142 | + # Print unique values for key columns |
| 143 | + for column in ['donor_id', 'cell_type', 'perturbation']: |
| 144 | + print_unique(adata, column) |
| 145 | + |
| 146 | + logger.info(f"Layers: {list(adata.layers.keys())}") |
| 147 | + logger.info(f"Metadata: {list(adata.uns.keys())}") |
| 148 | + |
| 149 | +def write_anndata(adata, par): |
| 150 | + """Write AnnData object to file.""" |
| 151 | + logger.info(f"Writing AnnData object to '{par['output']}'") |
| 152 | + adata.write_h5ad(par["output"], compression=par["output_compression"]) |
| 153 | + |
| 154 | +# Instead of defining main() and calling it at the end, write the code directly |
| 155 | +logger.info("Starting OP3 loader") |
| 156 | +# Load the data |
| 157 | +logger.info(f"Loading data at {par['input']}") |
| 158 | +adata = ad.read_h5ad(par["input"]) |
| 159 | + |
| 160 | +# Apply OP3-specific filtering |
| 161 | +adata = filter_op3_data(adata) |
| 162 | +# Filter by parameters |
| 163 | +if par["donor_id"] is not None: |
| 164 | + logger.info(f"Filtering for donor_id: {par['donor_id']}") |
| 165 | + adata = adata[adata.obs["donor_id"] == par["donor_id"]] |
| 166 | + |
| 167 | +if par["cell_type"] is not None: |
| 168 | + logger.info(f"Filtering for cell_type: {par['cell_type']}") |
| 169 | + adata = adata[adata.obs["cell_type"] == par["cell_type"]] |
| 170 | + |
| 171 | +if par["perturbation"] is not None: |
| 172 | + logger.info(f"Filtering for perturbation: {par['perturbation']}") |
| 173 | + adata = adata[adata.obs["perturbation"] == par["perturbation"]] |
| 174 | + |
| 175 | +# Fix splits after reannotation |
| 176 | +fix_splits(adata) |
| 177 | + |
| 178 | +# Move X to layers and normalize |
| 179 | +move_x_to_layers(adata) |
| 180 | + |
| 181 | +# Add dataset metadata |
| 182 | +add_metadata_to_uns(adata, par) |
| 183 | + |
| 184 | +# Add a feature name |
| 185 | +logger.info("Setting .var['feature_name']") |
| 186 | + |
| 187 | +if par["var_feature_name"] == "index": |
| 188 | + adata.var["feature_name"] = adata.var.index |
| 189 | +else: |
| 190 | + if par["var_feature_name"] in adata.var: |
| 191 | + adata.var["feature_name"] = adata.var[par["feature_name"]] |
| 192 | + del adata.var[par["feature_name"]] |
| 193 | + else: |
| 194 | + logger.info(f"Warning: key '{par['var_feature_name']}' could not be found in adata.var.") |
| 195 | + |
| 196 | +# Print summary and save |
| 197 | +print_summary(adata) |
| 198 | +write_anndata(adata, par) |
| 199 | + |
| 200 | +logger.info("Done") |
0 commit comments