1+ """
2+ Collection of helper scripts to generate general statistics on the data
3+ contained in a CoderData Object.
4+ """
5+
6+
7+ from copy import deepcopy
8+
9+ import numpy as np
10+
11+ import pandas as pd
12+
13+ import matplotlib .pyplot as plt
14+ from matplotlib .axes import Axes
15+ import seaborn as sns
16+
17+ import coderdata as cd
18+
def plot_2d_respones_metric(
        data: cd.Dataset,
        metric1: str,
        metric2: str,
        **kwargs: dict
        ) -> None:
    """
    Draws a joint 2D histogram of two dose response metrics.

    The ``experiments`` table of `data` is reshaped with
    ``_prepare_2d_hist_data`` so that both metrics become columns of one
    table, which is then passed to `seaborn.jointplot` using
    ``kind="hist"``.

    Parameters
    ----------
    data : coderdata.Dataset
        Dataset object whose ``experiments`` attribute is plotted.
    metric1 : str
        Name of the dose response metric shown on the y-axis.
    metric2 : str
        Name of the dose response metric shown on the x-axis.
    **kwargs : dict, optional
        Additional keyword arguments that can be passed to the function
        - joint_bins : forwarded as ``bins`` to the joint (2D) histogram;
          defaults to 50
        - marginal_bins : forwarded as ``bins`` to the marginal
          histograms; defaults to 50

    Returns
    -------
    None
    """

    # optional settings, with their defaults
    n_joint_bins = kwargs.get('joint_bins', 50)
    n_marginal_bins = kwargs.get('marginal_bins', 50)

    # one row per experiment with a column for each requested metric
    wide_table = _prepare_2d_hist_data(
        data=data.experiments,
        metrics=[metric1, metric2],
    )

    sns.jointplot(
        data=wide_table,
        x=metric2,
        y=metric1,
        kind="hist",
        joint_kws={"bins": n_joint_bins},
        marginal_kws={"bins": n_marginal_bins},
    )
42+
def plot_response_metric(
        data: cd.Dataset,
        metric: str = 'auc',
        ax: Axes = None,
        **kwargs: dict
        ) -> None:
    """
    Creates a histogram detailing the distribution of dose response
    values for a given dose response metric.

    If used in conjunction with `matplotlib.pyplot.subplot` or
    `matplotlib.pyplot.subplots` and the axes object is passed to the
    function, the function populates the axes object with the generated
    plot.

    Parameters
    ----------
    data : coderdata.Dataset
        A full CoderData object of a dataset
    metric : str, default='auc'
        A string that defines the response metric that should be plotted
    ax : matplotlib.axes.Axes, default=None
        An `Axes` object can be defined. This is useful if a multipanel
        subplot has been defined prior via `matplotlib.pyplot.subplots`.
        Passing the location of the axes to the function will then
        populate the subplot at the given location with the generated
        plot.
    **kwargs : dict, optional
        Additional keyword arguments that can be passed to the function
        - bins : int - sets the number of bins; passed to
                       `seaborn.histplot` (defaults to 10)
        - title : str - sets the title of the axes; if omitted the
                        title of the axes is left untouched
        - kde : bool - adds a kernel density estimate plot into the
                       histogram (defaults to False)
        Keys other than the ones listed above are ignored.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If `metric` is not present in the ``dose_response_metric``
        column of ``data.experiments``.

    Example
    -------
    In a Jupyter Notebook environment the following snippet can be used
    to display a histogram detailing the distribution of drug response
    AUC measures in the beataml dataset.

    >>> import coderdata as cd
    >>> beataml = cd.DataLoader('beataml')
    >>> cd.plot_response_metric(data=beataml, metric='auc', bins=10)

    For generating multipanel plots we can make use of matplotlib and
    the `ax` parameter of this function. Furthermore, other features /
    parameters of the created figure can be changed (e.g. the title of
    the figure via `suptitle()`). Finally it can be saved.

    >>> import coderdata as cd
    >>> import matplotlib.pyplot as plt
    >>> beataml = cd.DataLoader('beataml')
    >>> fig, axs = plt.subplots(ncols=2, figsize=(10, 5))
    >>> plot_response_metric(
    ...     data=beataml,
    ...     metric='auc',
    ...     bins=10,
    ...     ax=axs[0]
    ...     )
    >>> plot_response_metric(
    ...     data=beataml,
    ...     metric='aac',
    ...     bins=10,
    ...     ax=axs[1]
    ...     )
    >>> fig.set_layout_engine('tight')
    >>> fig.suptitle('Distribution of drug response values')
    >>> fig.savefig('figure.png')
    """

    # assigning values to variables based on **kwargs and defining
    # default values if not present in **kwargs
    n_bins = kwargs.get('bins', 10)
    title = kwargs.get('title', None)
    show_kde = kwargs.get('kde', False)

    # retrieving the values necessary to generate the figure: the
    # experiments table is grouped by metric and only the group of the
    # requested metric is kept (raises KeyError for an unknown metric)
    values = (
        data.experiments
        .groupby('dose_response_metric')
        .get_group(metric)
        ['dose_response_value']
    )

    # NOTE: set_theme changes the global seaborn / matplotlib theme
    sns.set_theme(palette='colorblind')
    hist_ax = sns.histplot(data=values, kde=show_kde, bins=n_bins, ax=ax)
    hist_ax.set_xlabel(metric)

    # only touch the title when one was requested, so that a title
    # already present on a caller supplied axes is not overwritten
    if title is not None:
        hist_ax.set_title(title)
136+
137+
def summarize_response_metric(data: cd.Dataset) -> pd.DataFrame:
    """
    Computes descriptive statistics of the dose response values stored
    in the `experiments` table of a CoderData object.

    The values in `dose_response_value` are grouped by
    `dose_response_metric` and summarized with `describe()`, which
    yields count, mean, standard deviation, minimum, 25-, 50- and
    75-percentile as well as maximum for every metric present in
    `experiments`.

    Parameters
    ----------
    data : coderdata.Dataset
        A full CoderData object of a dataset

    Returns
    -------
    pandas.DataFrame
        One row per dose response metric and one column per statistic.

    Example
    -------

    The Example assumes that a dataset with the prefix 'beataml' has
    been downloaded previously. See also ``coderdata.download()``

    >>> import coderdata as cd
    >>> beataml = cd.DataLoader('beataml')
    >>> summary_stats = summarize_response_metric(data=beataml)
    >>> summary_stats
                            count          mean           std
    dose_response_metric
    aac                   23378.0  3.028061e-01  1.821265e-01  ...
    auc                   23378.0  6.971939e-01  1.821265e-01  ...
    dss                   23378.0  3.218484e-01  5.733492e-01  ...
    ...                       ...           ...           ...  ...
    """
    experiments = data.experiments

    # response values, bucketed by the metric they belong to
    values_by_metric = (
        experiments.groupby('dose_response_metric')['dose_response_value']
    )

    # count, mean, std, min, quartiles and max for every bucket
    return values_by_metric.describe()
183+
184+
def _prepare_2d_hist_data(
        data: pd.DataFrame,
        metrics: list[str] = None,
        r2: float = None,
        ) -> pd.DataFrame:
    """
    Reshapes a long format experiments table into a table with one
    column per dose response metric.

    For every requested metric the rows of that metric are selected and
    their ``dose_response_value`` column is renamed to the metric name.
    The per-metric tables are then combined with inner merges on
    ``improve_drug_id``, ``improve_sample_id``, ``time`` and ``study``,
    so only experiments that have a value for every requested metric
    remain. The ``source`` and ``time_unit`` columns are taken from the
    first metric only. The input table is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Long format table containing at least the columns
        ``dose_response_metric``, ``dose_response_value``, ``source``,
        ``time_unit`` and the four merge key columns named above.
    metrics : list of str, default=None
        Metrics to turn into columns, in the order in which the columns
        should appear. If ``None``, the metrics ``aac``, ``auc``,
        ``dss``, ``fit_auc``, ``fit_ec50``, ``fit_ec50se``,
        ``fit_einf``, ``fit_hs``, ``fit_ic50`` and ``fit_r2`` are used.
    r2 : float, default=None
        If given, only experiments whose ``fit_r2`` value is strictly
        greater than `r2` are kept.

    Returns
    -------
    pandas.DataFrame
        Table with one column per requested metric.

    Raises
    ------
    KeyError
        If a requested metric (or ``fit_r2`` when `r2` is given) does
        not occur in ``dose_response_metric``.
    """
    # The default lives inside the function instead of the signature so
    # that no mutable list object is shared between calls.
    if metrics is None:
        metrics = [
            "aac", "auc", "dss",
            "fit_auc", "fit_ec50", "fit_ec50se",
            "fit_einf", "fit_hs", "fit_ic50",
            "fit_r2",
        ]

    merge_keys = ["improve_drug_id", "improve_sample_id", "time", "study"]
    # columns that would be duplicated by a merge and are therefore
    # dropped from every table except the first one
    redundant = ["source", "time_unit", "dose_response_metric"]

    metric_groups = data.groupby('dose_response_metric')

    def _metric_table(metric, value_name, drop_cols):
        # rename/drop return new frames, the group itself stays untouched
        return (
            metric_groups.get_group(metric)
            .rename(columns={"dose_response_value": value_name})
            .drop(columns=drop_cols)
        )

    # the first metric keeps `source` and `time_unit` for the result
    d_ret = _metric_table(metrics[0], metrics[0], ["dose_response_metric"])

    for metric in metrics[1:]:
        d_ret = d_ret.merge(
            _metric_table(metric, metric, redundant),
            on=merge_keys,
        )

    if r2 is not None:
        # The threshold values get their own column name so that they do
        # not clash with a `fit_r2` column requested via `metrics`.
        r2_table = _metric_table("fit_r2", "r2_thresh", redundant)
        d_ret = d_ret.merge(r2_table, on=merge_keys)
        d_ret = (
            d_ret[d_ret["r2_thresh"] > float(r2)]
            .drop(columns=["r2_thresh"])
        )

    return d_ret
0 commit comments