
Commit e6db454

Added CPJUMP1 compound MoA metadata (#59)
* updated cpjump1 plate selection to only compound-treated plates
* updated download module; removed functions
* fixed metadata naming bug
* updated nb configs
* updated download module
* updated control subsetting notebook
* updated preprocessing module and reran pre-commit
* reran notebook #3 in module 0
* added MoA information to cpjump compound data
* added MoAs
* updated
* Update notebooks/0.download-data/nbconverted/2.preprocessing.py

Co-authored-by: Jenna Tomkinson <107513215+jenna-tomkinson@users.noreply.github.com>
1 parent 7d4caf8 commit e6db454

3 files changed: +108 -38 lines changed

notebooks/0.download-data/2.preprocessing.ipynb

Lines changed: 57 additions & 20 deletions
@@ -36,7 +36,8 @@
     "import polars as pl\n",
     "\n",
     "sys.path.append(\"../../\")\n",
-    "from utils.data_utils import split_meta_and_features, add_cell_id_hash"
+    "from utils.data_utils import split_meta_and_features, add_cell_id_hash\n",
+    "from utils.io_utils import load_profiles"
    ]
   },
   {
@@ -93,15 +94,6 @@
     "        \"All elements in specific_plates must be pathlib.Path objects\"\n",
     "    )\n",
     "\n",
-    "    def load_profile(file: pathlib.Path) -> pl.DataFrame:\n",
-    "        \"\"\"internal function to load a single profile file.\"\"\"\n",
-    "        profile_df = pl.read_parquet(file)\n",
-    "        meta_cols, _ = split_meta_and_features(profile_df)\n",
-    "        if shared_features is not None:\n",
-    "            # Only select metadata and shared features\n",
-    "            return profile_df.select(meta_cols + shared_features)\n",
-    "        return profile_df\n",
-    "\n",
     "    # Use specific_plates if provided, otherwise gather all .parquet files\n",
     "    if specific_plates is not None:\n",
     "        # Validate that all specific plate files exist\n",
@@ -115,7 +107,9 @@
     "        raise FileNotFoundError(f\"No profile files found in {profile_dir}\")\n",
     "\n",
     "    # Load and concatenate profiles\n",
-    "    loaded_profiles = [load_profile(f) for f in files_to_load]\n",
+    "    loaded_profiles = [\n",
+    "        load_profiles(f, shared_features=shared_features) for f in files_to_load\n",
+    "    ]\n",
     "\n",
     "    # Concatenate all loaded profiles\n",
     "    return pl.concat(loaded_profiles, rechunk=True)\n",
@@ -205,6 +199,11 @@
     "# Setting profiles directory\n",
     "profiles_dir = (data_dir / \"sc-profiles\").resolve(strict=True)\n",
     "\n",
+    "# setting connectivity map drug repurposing config\n",
+    "drug_repurposing_config_path = (data_dir / \"repurposing_drugs_20180907.txt\").resolve(\n",
+    "    strict=True\n",
+    ")\n",
+    "\n",
     "# Experimental metadata\n",
     "exp_metadata_path = (\n",
     "    profiles_dir / \"cpjump1\" / \"cpjump1_compound_experimental-metadata.csv\"\n",
@@ -286,6 +285,14 @@
     "- Adding a unique cell id hash column `Metadata_cell_id`"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "9ec882fa",
+   "metadata": {},
+   "source": [
+    "We are loading per-plate parquet profiles for compound-treated plates, selecting the shared feature set, concatenating them into a single Polars DataFrame while preserving metadata, and adding a unique Metadata_cell_id for each cell. The resulting cpjump1_profiles table is ready for downstream analysis."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 5,
@@ -306,12 +313,38 @@
     ")\n",
     "\n",
     "# create an index column and unique cell ID based on the features of a single profile\n",
-    "cpjump1_profiles = add_cell_id_hash(cpjump1_profiles)\n",
+    "cpjump1_profiles = add_cell_id_hash(cpjump1_profiles)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3df9bbf5",
+   "metadata": {},
+   "source": [
+    "Next, we annotate the compound treatments in the CPJUMP1 dataset: each cell is tagged with Mechanism of Action (MoA) information from the [Clue Drug Repurposing Hub](https://clue.io/data/REP#REP). This resource provides comprehensive drug and tool compound annotations, including target information and clinical development status.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "adfb9148",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# load drug repurposing moa file and add prefix to metadata columns\n",
+    "rep_moa_df = pl.read_csv(\n",
+    "    drug_repurposing_config_path, separator=\"\\t\", skip_rows=9, encoding=\"utf8-lossy\"\n",
+    ").rename(lambda x: f\"Metadata_{x}\" if not x.startswith(\"Metadata_\") else x)\n",
+    "\n",
+    "# merge the original cpjump1_profiles with rep_moa_df on Metadata_pert_iname\n",
+    "cpjump1_profiles = cpjump1_profiles.join(\n",
+    "    rep_moa_df, on=\"Metadata_pert_iname\", how=\"left\"\n",
+    ")\n",
     "\n",
-    "# Split meta and features\n",
+    "# split meta and features\n",
     "meta_cols, features_cols = split_meta_and_features(cpjump1_profiles)\n",
     "\n",
-    "# Saving metadata and features of the concat profile into a json file\n",
+    "# save the feature space information into a json file\n",
     "meta_features_dict = {\n",
     "    \"concat-profiles\": {\n",
     "        \"meta-features\": meta_cols,\n",
@@ -321,7 +354,11 @@
     "with open(cpjump1_output_dir / \"concat_profiles_meta_features.json\", \"w\") as f:\n",
     "    json.dump(meta_features_dict, f, indent=4)\n",
     "\n",
-    "# save as parquet with defined order of columns\n",
+    "# save concatenated profiles\n",
+    "# Loading compound profiles with shared features and concat into a single DataFrame\n",
+    "concat_output_path = (\n",
+    "    cpjump1_output_dir / \"cpjump1_compound_concat_profiles.parquet\"\n",
+    ").resolve()\n",
     "cpjump1_profiles.select(meta_cols + features_cols).write_parquet(concat_output_path)"
    ]
   },
@@ -350,7 +387,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
    "id": "c5471d3e",
    "metadata": {},
    "outputs": [],
@@ -404,7 +441,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 8,
    "id": "c57da947",
    "metadata": {},
    "outputs": [],
@@ -437,7 +474,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 9,
    "id": "1d7ced04",
    "metadata": {},
    "outputs": [],
@@ -490,7 +527,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 10,
    "id": "42108980",
    "metadata": {},
    "outputs": [],
@@ -537,7 +574,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 11,
    "id": "1763d383",
    "metadata": {},
    "outputs": [],

notebooks/0.download-data/nbconverted/2.preprocessing.py

Lines changed: 40 additions & 18 deletions
@@ -26,6 +26,7 @@
 
 sys.path.append("../../")
 from utils.data_utils import add_cell_id_hash, split_meta_and_features
+from utils.io_utils import load_profiles
 
 # ## Helper functions
 #
@@ -71,15 +72,6 @@ def load_and_concat_profiles(
             "All elements in specific_plates must be pathlib.Path objects"
         )
 
-    def load_profile(file: pathlib.Path) -> pl.DataFrame:
-        """internal function to load a single profile file."""
-        profile_df = pl.read_parquet(file)
-        meta_cols, _ = split_meta_and_features(profile_df)
-        if shared_features is not None:
-            # Only select metadata and shared features
-            return profile_df.select(meta_cols + shared_features)
-        return profile_df
-
     # Use specific_plates if provided, otherwise gather all .parquet files
     if specific_plates is not None:
         # Validate that all specific plate files exist
@@ -93,7 +85,9 @@ def load_profile(file: pathlib.Path) -> pl.DataFrame:
         raise FileNotFoundError(f"No profile files found in {profile_dir}")
 
     # Load and concatenate profiles
-    loaded_profiles = [load_profile(f) for f in files_to_load]
+    loaded_profiles = [
+        load_profiles(f, shared_features=shared_features) for f in files_to_load
+    ]
 
     # Concatenate all loaded profiles
     return pl.concat(loaded_profiles, rechunk=True)
@@ -173,6 +167,11 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFrame:
 # Setting profiles directory
 profiles_dir = (data_dir / "sc-profiles").resolve(strict=True)
 
+# setting connectivity map drug repurposing config
+drug_repurposing_config_path = (data_dir / "repurposing_drugs_20180907.txt").resolve(
+    strict=True
+)
+
 # Experimental metadata
 exp_metadata_path = (
     profiles_dir / "cpjump1" / "cpjump1_compound_experimental-metadata.csv"
@@ -238,6 +237,8 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFrame:
 # - Data integrity is maintained during the merge operation
 # - Adding a unique cell id hash column `Metadata_cell_id`
 
+# We are loading per-plate parquet profiles for compound-treated plates, selecting the shared feature set, concatenating them into a single Polars DataFrame while preserving metadata, and adding a unique Metadata_cell_id for each cell. The resulting cpjump1_profiles table is ready for downstream analysis.
+
 # In[5]:
 
 
@@ -256,10 +257,27 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFrame:
 # create an index column and unique cell ID based on the features of a single profile
 cpjump1_profiles = add_cell_id_hash(cpjump1_profiles)
 
-# Split meta and features
+
+# Next, we annotate the compound treatments in the CPJUMP1 dataset: each cell is tagged with Mechanism of Action (MoA) information from the [Clue Drug Repurposing Hub](https://clue.io/data/REP#REP). This resource provides comprehensive drug and tool compound annotations, including target information and clinical development status.
+#
+
+# In[6]:
+
+
+# load drug repurposing moa file and add prefix to metadata columns
+rep_moa_df = pl.read_csv(
+    drug_repurposing_config_path, separator="\t", skip_rows=9, encoding="utf8-lossy"
+).rename(lambda x: f"Metadata_{x}" if not x.startswith("Metadata_") else x)
+
+# merge the original cpjump1_profiles with rep_moa_df on Metadata_pert_iname
+cpjump1_profiles = cpjump1_profiles.join(
+    rep_moa_df, on="Metadata_pert_iname", how="left"
+)
+
+# split meta and features
 meta_cols, features_cols = split_meta_and_features(cpjump1_profiles)
 
-# Saving metadata and features of the concat profile into a json file
+# save the feature space information into a json file
 meta_features_dict = {
     "concat-profiles": {
         "meta-features": meta_cols,
@@ -269,7 +287,11 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFrame:
 with open(cpjump1_output_dir / "concat_profiles_meta_features.json", "w") as f:
     json.dump(meta_features_dict, f, indent=4)
 
-# save as parquet with defined order of columns
+# save concatenated profiles
+# Loading compound profiles with shared features and concat into a single DataFrame
+concat_output_path = (
+    cpjump1_output_dir / "cpjump1_compound_concat_profiles.parquet"
+).resolve()
 cpjump1_profiles.select(meta_cols + features_cols).write_parquet(concat_output_path)
 
 
@@ -290,7 +312,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFrame:
 #
 # The preprocessing ensures that all MitoCheck datasets share a common feature space and are ready for comparative analysis with CPJUMP1 profiles.
 
-# In[6]:
+# In[7]:
 
 
 # load in mitocheck profiles and save as parquet
@@ -334,7 +356,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFrame:
 
 # Filter Cell Profiler (CP) features and preprocess columns by removing the "CP__" prefix to standardize feature names for downstream analysis.
 
-# In[7]:
+# In[8]:
 
 
 # Split profiles to only retain cell profiler features
@@ -357,7 +379,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFrame:
 
 # Splitting the metadata and feature columns for each dataset to enable targeted downstream analysis and ensure consistent data structure across all profiles.
 
-# In[8]:
+# In[9]:
 
 
 # manually selecting metadata features that are present across all 3 profiles
@@ -406,7 +428,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFrame:
 )
 
 
-# In[9]:
+# In[10]:
 
 
 # create concatenated mitocheck profiles
@@ -444,7 +466,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFrame:
 # - **Unique cell identification**: Adding `Metadata_cell_id` column with unique hash values based on all profile features to enable precise cell tracking and deduplication
 #
 
-# In[10]:
+# In[11]:
 
 
 # load in cfret profiles and add a unique cell ID
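One detail worth calling out in the new `In[6]` cell: `rename` with a callable prefixes every Repurposing Hub column with `Metadata_`, which is what makes `split_meta_and_features` classify the joined columns as metadata rather than features downstream. A quick illustration with assumed raw column names:

```python
import polars as pl

# Raw column names as they plausibly appear in the repurposing file (assumed).
raw = pl.DataFrame(
    {"pert_iname": ["aloxistatin"], "moa": ["cysteine protease inhibitor"]}
)

# Prefix every column that is not already namespaced as metadata.
renamed = raw.rename(lambda c: f"Metadata_{c}" if not c.startswith("Metadata_") else c)
print(renamed.columns)  # ['Metadata_pert_iname', 'Metadata_moa']
```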

utils/io_utils.py

Lines changed: 11 additions & 0 deletions
@@ -9,11 +9,14 @@
 import yaml
 from tqdm import tqdm
 
+from .data_utils import split_meta_and_features
+
 
 def load_profiles(
     fpath: str | pathlib.Path,
     convert_to_f32: bool = False,
     verbose: bool | None = False,
+    shared_features: list[str] | None = None,
 ) -> pl.DataFrame:
     """Load single-cell profiles from given file path.
 
@@ -29,6 +32,9 @@ def load_profiles(
         If True, converts all Float64 columns to Float32 to save memory. Default is False
     verbose : bool, optional
         If True, prints information about the loaded profiles. Default is False.
+    shared_features : list[str] | None, optional
+        If provided, only loads metadata columns and these specific feature columns.
+        Default is None (loads all columns).
 
     Returns
     -------
@@ -61,6 +67,11 @@ def load_profiles(
     # load profiles
     loaded_profiles = pl.read_parquet(fpath)
 
+    # filter to shared features if provided
+    if shared_features is not None:
+        meta_cols, _ = split_meta_and_features(loaded_profiles)
+        loaded_profiles = loaded_profiles.select(meta_cols + shared_features)
+
     # convert all Float64 columns to Float32 if convert_to_f32 is True
     if convert_to_f32:
         loaded_profiles = loaded_profiles.with_columns(
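A hedged usage sketch of the extended helper; the plate path and feature names below are placeholders for illustration, not files shipped with this repo:

```python
import pathlib
import sys

sys.path.append("../../")  # repo-root import pattern used by the notebooks
from utils.io_utils import load_profiles

# Hypothetical plate file and shared feature list.
plate_path = pathlib.Path("data/sc-profiles/cpjump1/plate_1.parquet")
shared = ["Cells_AreaShape_Area", "Nuclei_Intensity_IntegratedIntensity_DNA"]

# Returns the metadata columns (as identified by split_meta_and_features)
# plus only the two requested features; shared_features=None loads everything.
plate_df = load_profiles(plate_path, shared_features=shared)
```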
