Skip to content

Commit e3f4df3

Browse files
committed
Merge remote-tracking branch 'origin/main' into build_2.2_bug_fixes
2 parents d628dd3 + a408582 commit e3f4df3

File tree

7 files changed

+1676
-884
lines changed

7 files changed

+1676
-884
lines changed

build/pancpdo/README.md

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,32 @@
1-
## HCMI Data
1+
## Pancreatic PDO Data
2+
3+
4+
Here we will store the scripts required to process the omics data from the
5+
Genomic Data Commons together with the drug response data.
6+
7+
The GDC hosts the pancreatic PDO omics data, so to update it we need an
8+
up-to-date manifest, obtained as follows:
9+
10+
11+
1. Navigate to the [GDC Data
12+
Portal](https://portal.gdc.cancer.gov/analysis_page?app=Projects),
13+
and select 'ORGANOID-PANCREATIC'
14+
2. Click on the 'Cases' button, and select the download button where
15+
it lists the number of files.
16+
3. This will download the ENTIRE Manifest
17+
4. Filter the manifest for RNASeq, WGS mutations, and copy number
18+
(though I don't think this dataset has copy number)
19+
calls using the following command:
20+
```
21+
cat ~gdc_manifest.2025-07-08.091940.txt | grep 'rna_seq\|md5' \
22+
| grep 'counts\|md5' | grep 'txt\|maf\|tsv\|md5' > new_manifest.txt
23+
cp new_manifest.txt full_manifest.txt
24+
25+
```
26+
27+
The other data is stored [on synapse](https://www.synapse.org/Synapse:syn64597875).
28+
229

3-
Here we will store the scripts required to process the data from the [Human Cancer Models Initiative](https://ocg.cancer.gov/programs/HCMI)
430

531
## Build Docker
632

build/pancpdo/full_manifest.txt

Lines changed: 31 additions & 872 deletions
Large diffs are not rendered by default.

coderdata/__init__.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,4 +13,14 @@
1313

1414

1515
from .utils.utils import version
16-
from .utils.utils import list_datasets
16+
from .utils.utils import list_datasets
17+
18+
try:
19+
import matplotlib
20+
import seaborn as sns
21+
except ModuleNotFoundError:
22+
pass
23+
else:
24+
from .utils.stats import summarize_response_metric
25+
from .utils.stats import plot_response_metric
26+
from .utils.stats import plot_2d_respones_metric

coderdata/utils/__init__.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,17 @@
11
from .utils import version
2-
from .utils import list_datasets
2+
from .utils import list_datasets
3+
4+
# The statistics / plotting helpers depend on the optional packages
# 'matplotlib' and 'seaborn'. They are only re-exported when both can be
# imported; otherwise a warning names the package that failed to import.
try:
    import matplotlib
    import seaborn as sns
except ModuleNotFoundError as err:
    import warnings

    # `ModuleNotFoundError.name` holds the name of the module that could
    # not be found; fall back to a generic wording if it is not set.
    missing = f"'{err.name}'" if err.name else "A required"
    warnings.warn(
        f"{missing} package was not available. To use "
        "coderdata.utils.stats functions please make sure 'matplotlib' & "
        "'seaborn' are available in the environment."
    )
else:
    from .stats import summarize_response_metric
    from .stats import plot_response_metric
    from .stats import plot_2d_respones_metric

coderdata/utils/stats.py

Lines changed: 232 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,232 @@
1+
"""
2+
Collection of helper scripts to generate general statistics on the data
3+
contained in a CoderData Object.
4+
"""
5+
6+
7+
from copy import deepcopy
8+
9+
import numpy as np
10+
11+
import pandas as pd
12+
13+
import matplotlib.pyplot as plt
14+
from matplotlib.axes import Axes
15+
import seaborn as sns
16+
17+
import coderdata as cd
18+
19+
def plot_2d_respones_metric(
        data: cd.Dataset,
        metric1: str,
        metric2: str,
        **kwargs: dict
        ) -> None:
    """
    Draws a joint (2D) histogram of two dose response metrics.

    The `experiments` table of `data` is reshaped via
    `_prepare_2d_hist_data` so that both metrics become columns of the
    same row, and the result is handed to `seaborn.jointplot` with
    `kind="hist"`. `metric2` is drawn on the x axis and `metric1` on
    the y axis.

    Parameters
    ----------
    data : coderdata.Dataset
        Dataset object; only its `experiments` attribute is read.
    metric1 : str
        Name of the dose response metric plotted on the y axis.
    metric2 : str
        Name of the dose response metric plotted on the x axis.
    **kwargs : dict, optional
        Additional keyword arguments that are evaluated by the function
        - joint_bins : int - number of bins of the joint histogram
          (default 50)
        - marginal_bins : int - number of bins of the marginal
          histograms (default 50)

    Returns
    -------
    None
    """

    # resolving the optional settings (and their defaults) first
    n_joint_bins = kwargs.get('joint_bins', 50)
    n_marginal_bins = kwargs.get('marginal_bins', 50)

    # one row per experiment with one column per requested metric
    paired_values = _prepare_2d_hist_data(
        data=data.experiments,
        metrics=[metric1, metric2],
    )

    sns.jointplot(
        data=paired_values,
        x=metric2,
        y=metric1,
        kind="hist",
        joint_kws={'bins': n_joint_bins},
        marginal_kws={'bins': n_marginal_bins},
    )
42+
43+
def plot_response_metric(
        data: cd.Dataset,
        metric: str='auc',
        ax: Axes=None,
        **kwargs: dict
        ) -> None:
    """
    Plots a histogram of the values of one dose response metric.

    The rows of `data.experiments` whose `dose_response_metric` equals
    `metric` are selected and their `dose_response_value` column is
    passed to `seaborn.histplot`. If an `Axes` object is passed via
    `ax` (e.g. one created with `matplotlib.pyplot.subplots`), the
    histogram is drawn into that axes object.

    Note that the function calls `seaborn.set_theme` with the
    'colorblind' palette before plotting.

    Parameters
    ----------
    data : coderdata.Dataset
        Dataset object; only its `experiments` attribute is read.
    metric : str, default='auc'
        Name of the dose response metric that should be plotted. It is
        also used as the x axis label.
    ax : matplotlib.axes.Axes, default=None
        Axes object that is forwarded to `seaborn.histplot`. Useful to
        fill one panel of a multipanel figure.
    **kwargs : dict, optional
        Additional keyword arguments that are evaluated by the function
        - bins : int - number of bins (default 10); passed to
          `seaborn.histplot`
        - title : str - title of the axes (default None)
        - kde : bool - adds a kernel density estimate to the histogram
          (default False)
        Other keyword arguments are ignored.

    Returns
    -------
    None

    Example
    -------
    The examples assume that `beataml` is a dataset object that has
    been loaded beforehand.

    >>> import coderdata as cd
    >>> cd.plot_response_metric(data=beataml, metric='auc', bins=10)

    Multipanel figures can be created with matplotlib and the `ax`
    parameter. The figure itself can then be modified further (e.g.
    via `suptitle()`) and saved.

    >>> import matplotlib.pyplot as plt
    >>> fig, axs = plt.subplots(ncols=2, figsize=(10, 5))
    >>> cd.plot_response_metric(
    ...     data=beataml,
    ...     metric='auc',
    ...     bins=10,
    ...     ax=axs[0]
    ...     )
    >>> cd.plot_response_metric(
    ...     data=beataml,
    ...     metric='aac',
    ...     bins=10,
    ...     ax=axs[1]
    ...     )
    >>> fig.set_layout_engine('tight')
    >>> fig.suptitle('Distribution of drug response values')
    >>> fig.savefig('figure.png')
    """

    # optional settings taken from **kwargs, with their defaults
    n_bins = kwargs.get('bins', 10)
    axes_title = kwargs.get('title', None)
    with_kde = kwargs.get('kde', False)

    # selecting the values that belong to the requested metric
    by_metric = data.experiments.groupby('dose_response_metric')
    values = by_metric.get_group(metric)['dose_response_value']

    sns.set_theme(palette='colorblind')
    hist_axes = sns.histplot(data=values, kde=with_kde, bins=n_bins, ax=ax)
    hist_axes.set_xlabel(metric)
    hist_axes.set_title(axes_title)
137+
138+
def summarize_response_metric(data: cd.Dataset) -> pd.DataFrame:
    """
    Summarizes the dose response values of a dataset per metric.

    The `experiments` table of `data` is grouped by
    `dose_response_metric` and `describe()` is applied to the
    `dose_response_value` column of every group, i.e. the result holds
    the descriptive statistics reported by pandas (count, mean,
    standard deviation, minimum, quartiles and maximum) for each metric
    present in `experiments`.

    Parameters
    ----------
    data : coderdata.Dataset
        Dataset object; only its `experiments` attribute is read.

    Returns
    -------
    pandas.DataFrame
        One row per dose response metric, one column per statistic.

    Example
    -------
    The example assumes that `beataml` is a dataset object that has
    been loaded beforehand.

    >>> import coderdata as cd
    >>> summary_stats = cd.summarize_response_metric(data=beataml)
    >>> summary_stats
                            count          mean           std
    dose_response_metric
    aac                   23378.0  3.028061e-01  1.821265e-01  ...
    auc                   23378.0  6.971939e-01  1.821265e-01  ...
    dss                   23378.0  3.218484e-01  5.733492e-01  ...
    ...                       ...           ...           ...  ...
    """
    # the value column, split up by the metric it belongs to
    values_by_metric = (
        data.experiments
        .groupby('dose_response_metric')
        ['dose_response_value']
    )
    return values_by_metric.describe()
183+
184+
185+
def _prepare_2d_hist_data(
        data: pd.DataFrame,
        metrics: list[str]=None,
        r2: float=None,
        ) -> pd.DataFrame:
    """
    Reshapes a long format experiments table into one column per metric.

    `data` is grouped by `dose_response_metric`. For every requested
    metric the `dose_response_value` column of its group is renamed to
    the metric name, and the per metric tables are merged on
    `improve_drug_id`, `improve_sample_id`, `time` and `study` using the
    default (inner) join of `pandas.DataFrame.merge`. Only rows that
    have a value for every requested metric are therefore retained.

    The table of the first metric keeps all remaining columns of `data`
    (except `dose_response_metric`); for all further metrics the columns
    `source`, `time_unit` and `dose_response_metric` are dropped before
    merging to avoid duplicated columns.

    `data` itself is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Long format table containing at least the columns
        `dose_response_metric`, `dose_response_value`, `source`,
        `time_unit`, `improve_drug_id`, `improve_sample_id`, `time` and
        `study`.
    metrics : list[str], default=None
        Names of the metrics to turn into columns. If None, the metrics
        "aac", "auc", "dss", "fit_auc", "fit_ec50", "fit_ec50se",
        "fit_einf", "fit_hs", "fit_ic50" and "fit_r2" are used.
    r2 : float, default=None
        If given, only rows whose "fit_r2" value is strictly greater
        than `r2` are retained.

    Returns
    -------
    pandas.DataFrame
        Table with one column per requested metric.

    Raises
    ------
    ValueError
        If `metrics` is empty.
    KeyError
        If a requested metric (or "fit_r2" when `r2` is given) is not
        present in `data` (raised by `get_group`).
    """
    # The default is created per call instead of in the signature so
    # that no mutable object is shared between calls.
    if metrics is None:
        metrics = [
            "aac", "auc", "dss",
            "fit_auc", "fit_ec50", "fit_ec50se",
            "fit_einf", "fit_hs", "fit_ic50",
            "fit_r2",
        ]
    if len(metrics) == 0:
        raise ValueError("'metrics' must contain at least one metric name")

    merge_keys = ["improve_drug_id", "improve_sample_id", "time", "study"]
    # columns that would be duplicated by merging further metrics
    redundant = ['source', 'time_unit', 'dose_response_metric']

    metric_groups = data.groupby('dose_response_metric')

    def _metric_table(group, column, drop):
        # `rename` / `drop` return new objects, so neither `data` nor
        # the group is modified and no explicit copy is necessary.
        return (
            metric_groups.get_group(group)
            .rename(columns={"dose_response_value": column})
            .drop(columns=drop)
        )

    # retrieved first so that a missing "fit_r2" group fails early
    r2_ = None
    if r2 is not None:
        r2_ = _metric_table("fit_r2", "r2_thresh", redundant)

    d_ret = _metric_table(metrics[0], metrics[0], ["dose_response_metric"])
    for metric in metrics[1:]:
        d_ret = d_ret.merge(
            _metric_table(metric, metric, redundant),
            on=merge_keys,
        )

    if r2_ is not None:
        d_ret = d_ret.merge(r2_, on=merge_keys)
        # Non-inplace drop: an inplace drop on the filtered slice can
        # trigger pandas' SettingWithCopyWarning.
        d_ret = (
            d_ret[d_ret["r2_thresh"] > float(r2)]
            .drop(columns=["r2_thresh"])
        )

    return d_ret

0 commit comments

Comments
 (0)