
Commit e6db454

Added CPJUMP1 compound MoA metadata (#59)
* updated cpjump1 plate selection to only compound-treated plates
* updated download module; removed functions
* fixed metadata naming bug
* updated nb configs
* updated download module
* updated control subsetting notebook
* updated preprocessing module and reran pre-commit
* reran notebook #3 in module 0
* added MoA information to cpjump compound data
* added MoAs
* updated
* Update notebooks/0.download-data/nbconverted/2.preprocessing.py

Co-authored-by: Jenna Tomkinson <107513215+jenna-tomkinson@users.noreply.github.com>
1 parent 7d4caf8 commit e6db454

3 files changed: +108 -38 lines changed

notebooks/0.download-data/2.preprocessing.ipynb

Lines changed: 57 additions & 20 deletions
@@ -36,7 +36,8 @@
     "import polars as pl\n",
     "\n",
     "sys.path.append(\"../../\")\n",
-    "from utils.data_utils import split_meta_and_features, add_cell_id_hash"
+    "from utils.data_utils import split_meta_and_features, add_cell_id_hash\n",
+    "from utils.io_utils import load_profiles"
    ]
   },
   {
@@ -93,15 +94,6 @@
     "        \"All elements in specific_plates must be pathlib.Path objects\"\n",
     "    )\n",
     "\n",
-    "    def load_profile(file: pathlib.Path) -> pl.DataFrame:\n",
-    "        \"\"\"internal function to load a single profile file.\"\"\"\n",
-    "        profile_df = pl.read_parquet(file)\n",
-    "        meta_cols, _ = split_meta_and_features(profile_df)\n",
-    "        if shared_features is not None:\n",
-    "            # Only select metadata and shared features\n",
-    "            return profile_df.select(meta_cols + shared_features)\n",
-    "        return profile_df\n",
-    "\n",
     "    # Use specific_plates if provided, otherwise gather all .parquet files\n",
     "    if specific_plates is not None:\n",
     "        # Validate that all specific plate files exist\n",
@@ -115,7 +107,9 @@
     "        raise FileNotFoundError(f\"No profile files found in {profile_dir}\")\n",
     "\n",
     "    # Load and concatenate profiles\n",
-    "    loaded_profiles = [load_profile(f) for f in files_to_load]\n",
+    "    loaded_profiles = [\n",
+    "        load_profiles(f, shared_features=shared_features) for f in files_to_load\n",
+    "    ]\n",
     "\n",
     "    # Concatenate all loaded profiles\n",
     "    return pl.concat(loaded_profiles, rechunk=True)\n",
@@ -205,6 +199,11 @@
     "# Setting profiles directory\n",
     "profiles_dir = (data_dir / \"sc-profiles\").resolve(strict=True)\n",
     "\n",
+    "# setting connectivity map drug repurposing config\n",
+    "drug_repurposing_config_path = (data_dir / \"repurposing_drugs_20180907.txt\").resolve(\n",
+    "    strict=True\n",
+    ")\n",
+    "\n",
     "# Experimental metadata\n",
     "exp_metadata_path = (\n",
     "    profiles_dir / \"cpjump1\" / \"cpjump1_compound_experimental-metadata.csv\"\n",
@@ -286,6 +285,14 @@
     "- Adding a unique cell id hash column `Metadata_cell_id`"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "9ec882fa",
+   "metadata": {},
+   "source": [
+    "We are loading per-plate parquet profiles for compound-treated plates, selecting the shared feature set, concatenating them into a single Polars DataFrame while preserving metadata, and adding a unique Metadata_cell_id for each cell. The resulting cpjump1_profiles table is ready for downstream analysis."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 5,
@@ -306,12 +313,38 @@
     ")\n",
     "\n",
     "# create an index column and unique cell ID based on the features of a single profile\n",
-    "cpjump1_profiles = add_cell_id_hash(cpjump1_profiles)\n",
+    "cpjump1_profiles = add_cell_id_hash(cpjump1_profiles)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3df9bbf5",
+   "metadata": {},
+   "source": [
+    "Next, we annotate the compound treatments in the CPJUMP1 dataset: each cell is tagged with Mechanism of Action (MoA) information from the [Clue Drug Repurposing Hub](https://clue.io/data/REP#REP). This resource provides comprehensive drug and tool compound annotations, including target information and clinical development status.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "adfb9148",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# load drug repurposing moa file and add prefix to metadata columns\n",
+    "rep_moa_df = pl.read_csv(\n",
+    "    drug_repurposing_config_path, separator=\"\\t\", skip_rows=9, encoding=\"utf8-lossy\"\n",
+    ").rename(lambda x: f\"Metadata_{x}\" if not x.startswith(\"Metadata_\") else x)\n",
+    "\n",
+    "# merge the original cpjump1_profiles with rep_moa_df on Metadata_pert_iname\n",
+    "cpjump1_profiles = cpjump1_profiles.join(\n",
+    "    rep_moa_df, on=\"Metadata_pert_iname\", how=\"left\"\n",
+    ")\n",
     "\n",
-    "# Split meta and features\n",
+    "# split meta and features\n",
     "meta_cols, features_cols = split_meta_and_features(cpjump1_profiles)\n",
     "\n",
-    "# Saving metadata and features of the concat profile into a json file\n",
+    "# save the feature space information into a json file\n",
     "meta_features_dict = {\n",
     "    \"concat-profiles\": {\n",
     "        \"meta-features\": meta_cols,\n",
@@ -321,7 +354,11 @@
     "with open(cpjump1_output_dir / \"concat_profiles_meta_features.json\", \"w\") as f:\n",
     "    json.dump(meta_features_dict, f, indent=4)\n",
     "\n",
-    "# save as parquet with defined order of columns\n",
+    "# save concatenated profiles\n",
+    "# Loading compound profiles with shared features and concat into a single DataFrame\n",
+    "concat_output_path = (\n",
+    "    cpjump1_output_dir / \"cpjump1_compound_concat_profiles.parquet\"\n",
+    ").resolve()\n",
     "cpjump1_profiles.select(meta_cols + features_cols).write_parquet(concat_output_path)"
    ]
   },
@@ -350,7 +387,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
    "id": "c5471d3e",
    "metadata": {},
    "outputs": [],
@@ -404,7 +441,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 8,
    "id": "c57da947",
    "metadata": {},
    "outputs": [],
@@ -437,7 +474,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 9,
    "id": "1d7ced04",
    "metadata": {},
    "outputs": [],
@@ -490,7 +527,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 10,
    "id": "42108980",
    "metadata": {},
    "outputs": [],
@@ -537,7 +574,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 11,
    "id": "1763d383",
    "metadata": {},
    "outputs": [],

notebooks/0.download-data/nbconverted/2.preprocessing.py

Lines changed: 40 additions & 18 deletions
@@ -26,6 +26,7 @@
 
 sys.path.append("../../")
 from utils.data_utils import add_cell_id_hash, split_meta_and_features
+from utils.io_utils import load_profiles
 
 # ## Helper functions
 #
@@ -71,15 +72,6 @@ def load_and_concat_profiles(
             "All elements in specific_plates must be pathlib.Path objects"
         )
 
-    def load_profile(file: pathlib.Path) -> pl.DataFrame:
-        """internal function to load a single profile file."""
-        profile_df = pl.read_parquet(file)
-        meta_cols, _ = split_meta_and_features(profile_df)
-        if shared_features is not None:
-            # Only select metadata and shared features
-            return profile_df.select(meta_cols + shared_features)
-        return profile_df
-
     # Use specific_plates if provided, otherwise gather all .parquet files
     if specific_plates is not None:
         # Validate that all specific plate files exist
@@ -93,7 +85,9 @@ def load_profile(file: pathlib.Path) -> pl.DataFrame:
         raise FileNotFoundError(f"No profile files found in {profile_dir}")
 
     # Load and concatenate profiles
-    loaded_profiles = [load_profile(f) for f in files_to_load]
+    loaded_profiles = [
+        load_profiles(f, shared_features=shared_features) for f in files_to_load
+    ]
 
     # Concatenate all loaded profiles
     return pl.concat(loaded_profiles, rechunk=True)
@@ -173,6 +167,11 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFrame:
 # Setting profiles directory
 profiles_dir = (data_dir / "sc-profiles").resolve(strict=True)
 
+# setting connectivity map drug repurposing config
+drug_repurposing_config_path = (data_dir / "repurposing_drugs_20180907.txt").resolve(
+    strict=True
+)
+
 # Experimental metadata
 exp_metadata_path = (
     profiles_dir / "cpjump1" / "cpjump1_compound_experimental-metadata.csv"
@@ -238,6 +237,8 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFrame:
 # - Data integrity is maintained during the merge operation
 # - Adding a unique cell id hash column `Metadata_cell_id`
 
+# We are loading per-plate parquet profiles for compound-treated plates, selecting the shared feature set, concatenating them into a single Polars DataFrame while preserving metadata, and adding a unique Metadata_cell_id for each cell. The resulting cpjump1_profiles table is ready for downstream analysis.
+
 # In[5]:
 
 
@@ -256,10 +257,27 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFrame:
 # create an index column and unique cell ID based on the features of a single profile
 cpjump1_profiles = add_cell_id_hash(cpjump1_profiles)
 
-# Split meta and features
+
+# Next, we annotate the compound treatments in the CPJUMP1 dataset: each cell is tagged with Mechanism of Action (MoA) information from the [Clue Drug Repurposing Hub](https://clue.io/data/REP#REP). This resource provides comprehensive drug and tool compound annotations, including target information and clinical development status.
+#
+
+# In[6]:
+
+
+# load drug repurposing moa file and add prefix to metadata columns
+rep_moa_df = pl.read_csv(
+    drug_repurposing_config_path, separator="\t", skip_rows=9, encoding="utf8-lossy"
+).rename(lambda x: f"Metadata_{x}" if not x.startswith("Metadata_") else x)
+
+# merge the original cpjump1_profiles with rep_moa_df on Metadata_pert_iname
+cpjump1_profiles = cpjump1_profiles.join(
+    rep_moa_df, on="Metadata_pert_iname", how="left"
+)
+
+# split meta and features
 meta_cols, features_cols = split_meta_and_features(cpjump1_profiles)
 
-# Saving metadata and features of the concat profile into a json file
+# save the feature space information into a json file
 meta_features_dict = {
     "concat-profiles": {
         "meta-features": meta_cols,
@@ -269,7 +287,11 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFrame:
 with open(cpjump1_output_dir / "concat_profiles_meta_features.json", "w") as f:
     json.dump(meta_features_dict, f, indent=4)
 
-# save as parquet with defined order of columns
+# save concatenated profiles
+# Loading compound profiles with shared features and concat into a single DataFrame
+concat_output_path = (
+    cpjump1_output_dir / "cpjump1_compound_concat_profiles.parquet"
+).resolve()
 cpjump1_profiles.select(meta_cols + features_cols).write_parquet(concat_output_path)
 
 
@@ -290,7 +312,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFrame:
 #
 # The preprocessing ensures that all MitoCheck datasets share a common feature space and are ready for comparative analysis with CPJUMP1 profiles.
 
-# In[6]:
+# In[7]:
 
 
 # load in mitocheck profiles and save as parquet
@@ -334,7 +356,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFrame:
 
 # Filter Cell Profiler (CP) features and preprocess columns by removing the "CP__" prefix to standardize feature names for downstream analysis.
 
-# In[7]:
+# In[8]:
 
 
 # Split profiles to only retain cell profiler features
@@ -357,7 +379,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFrame:
 
 # Splitting the metadata and feature columns for each dataset to enable targeted downstream analysis and ensure consistent data structure across all profiles.
 
-# In[8]:
+# In[9]:
 
 
 # manually selecting metadata features that are present across all 3 profiles
@@ -406,7 +428,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFrame:
 )
 
 
-# In[9]:
+# In[10]:
 
 
 # create concatenated mitocheck profiles
@@ -444,7 +466,7 @@ def remove_feature_prefixes(df: pl.DataFrame, prefix: str = "CP__") -> pl.DataFrame:
 # - **Unique cell identification**: Adding `Metadata_cell_id` column with unique hash values based on all profile features to enable precise cell tracking and deduplication
 #
 
-# In[10]:
+# In[11]:
 
 
 # load in cfret profiles and add a unique cell ID
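One detail worth calling out in the new `In[6]` cell: `rename` with a callable prefixes every Repurposing Hub column with `Metadata_`, which is what makes `split_meta_and_features` classify the joined columns as metadata rather than features downstream. A quick illustration with assumed raw column names:

```python
import polars as pl

# Raw column names as they plausibly appear in the repurposing file (assumed).
raw = pl.DataFrame(
    {"pert_iname": ["aloxistatin"], "moa": ["cysteine protease inhibitor"]}
)

# Prefix every column that is not already namespaced as metadata.
renamed = raw.rename(lambda c: f"Metadata_{c}" if not c.startswith("Metadata_") else c)
print(renamed.columns)  # ['Metadata_pert_iname', 'Metadata_moa']
```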

utils/io_utils.py

Lines changed: 11 additions & 0 deletions
@@ -9,11 +9,14 @@
 import yaml
 from tqdm import tqdm
 
+from .data_utils import split_meta_and_features
+
 
 def load_profiles(
     fpath: str | pathlib.Path,
     convert_to_f32: bool = False,
     verbose: bool | None = False,
+    shared_features: list[str] | None = None,
 ) -> pl.DataFrame:
     """Load single-cell profiles from given file path.
 
@@ -29,6 +32,9 @@ def load_profiles(
         If True, converts all Float64 columns to Float32 to save memory. Default is False
     verbose : bool, optional
         If True, prints information about the loaded profiles. Default is False.
+    shared_features : list[str] | None, optional
+        If provided, only loads metadata columns and these specific feature columns.
+        Default is None (loads all columns).
 
     Returns
     -------
@@ -61,6 +67,11 @@ def load_profiles(
     # load profiles
     loaded_profiles = pl.read_parquet(fpath)
 
+    # filter to shared features if provided
+    if shared_features is not None:
+        meta_cols, _ = split_meta_and_features(loaded_profiles)
+        loaded_profiles = loaded_profiles.select(meta_cols + shared_features)
+
     # convert all Float64 columns to Float32 if convert_to_f32 is True
     if convert_to_f32:
         loaded_profiles = loaded_profiles.with_columns(
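A hedged usage sketch of the extended helper; the plate path and feature names below are placeholders for illustration, not files shipped with this repo:

```python
import pathlib
import sys

sys.path.append("../../")  # repo-root import pattern used by the notebooks
from utils.io_utils import load_profiles

# Hypothetical plate file and shared feature list.
plate_path = pathlib.Path("data/sc-profiles/cpjump1/plate_1.parquet")
shared = ["Cells_AreaShape_Area", "Nuclei_Intensity_IntegratedIntensity_DNA"]

# Returns the metadata columns (as identified by split_meta_and_features)
# plus only the two requested features; shared_features=None loads everything.
plate_df = load_profiles(plate_path, shared_features=shared)
```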
