Skip to content

Commit efa4b88

Browse files
yonghaoygithub-actions
andauthored
add AoU snippets (#244)
Co-authored-by: github-actions <[email protected]>
1 parent 4cb26be commit efa4b88

File tree

59 files changed

+1358
-1
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

59 files changed

+1358
-1
lines changed

src/aou-common/extension-builder/snippets/example/hello.py

Lines changed: 0 additions & 1 deletion
This file was deleted.
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
import os
2+
import numpy as np
3+
import pandas as pd
4+
import pandas_profiling
5+
import plotnine
6+
from plotnine import * # Provides a ggplot-like interface to matplotlib.
7+
from IPython.display import display
8+
9+
## Plot setup.
# Make the black-and-white theme (base font size 11) the default for plots.
theme_set(theme_bw(base_size=11))
11+
12+
def get_boxplot_fun_data(df):
    """Returns a data frame with a y position and a label, for use annotating ggplot boxplots.

    Args:
      df: The values of one boxplot group; anything that supports max() and len().
        NOTE(review): presumably a pandas Series of y values handed over by
        stat_summary(fun_data=...) - confirm against plotnine.
    Returns:
      A one-row data frame (index 0) with column y as the max of df and column
      label as the text 'N = <length of df>'.
    """
    annotation = {'y': max(df), 'label': f'N = {len(df)}'}
    return pd.DataFrame(data=annotation, index=[0])
22+
23+
# NOTE: if you get any errors from this cell, restart your kernel and run it again.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# Use snippet 'add_age_to_demographics' to calculate the age of people in your demographics.
# It assumes the 'Setup' snippet has been executed.
# It also assumes that you got your demographics dataframe from Dataset Builder.

# Note: This snippet calculates age as of today and does not take into account whether the
# person is deceased.


## -----[ CHANGE THE DATAFRAME NAME(S) `YOUR_DATASET_NAME_person_df` TO MATCH YOURS FROM DATASET BUILDER] -----
today = pd.to_datetime('today')
birth_dates = YOUR_DATASET_NAME_person_df['date_of_birth']
# A plain difference of calendar years overstates the age of anyone whose birthday has not
# happened yet this year, so subtract one year for those people.
birthday_still_ahead = ((birth_dates.dt.month > today.month)
                        | ((birth_dates.dt.month == today.month) & (birth_dates.dt.day > today.day)))
YOUR_DATASET_NAME_person_df['age'] = today.year - birth_dates.dt.year - birthday_still_ahead.astype(int)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Use snippet 'join_dataframes' to join together two dataframes.
# It assumes the 'Setup' snippet has been executed.
#
# The example below joins Demographics '_person_df' with Measurements '_measurement_df'.
# No join key is given, so the join uses any columns the two dataframes have in common,
# which in this case should only be 'person_id'.
#
# See also https://pandas.pydata.org/pandas-docs/version/0.25.1/reference/api/pandas.merge.html


## -----[ CHANGE THE DATAFRAME NAME(S) TO MATCH YOURS FROM DATASET BUILDER] -----
measurement_df = YOUR_DATASET_NAME_person_df.merge(YOUR_DATASET_NAME_measurement_df, how='inner')

# Show (rows, columns) of the joined dataframe.
measurement_df.shape
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# Use snippet 'summarize_a_dataframe' to display summary statistics for a dataframe.
# It assumes snippet 'Setup' has been executed.
# See also https://towardsdatascience.com/exploring-your-data-with-just-1-line-of-python-4b35ce21a82d


## -----[ CHANGE THE DATAFRAME NAME(S) TO MATCH YOURS FROM DATASET BUILDER] -----
# Examine up to the first 10,000 rows. Larger dataframes can be profiled, but it takes more time.
# The slice is positional (.iloc): a label-based .loc[:10000] would also include the row
# labelled 10000 and would depend on the index labels instead of on row position.
YOUR_DATASET_NAME_person_df.iloc[:10000, :].profile_report()
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
# Use snippet 'measurement_by_age_and_sex_at_birth' to plot joined demographics and measurements dataframes.
# This plot assumes 'measurement_df' was created using snippet 'Basic operations -> join_dataframes' to
# join together demographics and measurements dataframes.
# See also https://plotnine.readthedocs.io/en/stable/


# There could be many different measurements in the dataframe. By default, plot the first one.
measurement_to_plot = measurement_df['standard_concept_name'].unique()[0]

# Create derived variables: age in whole years at measurement time, then an age group.
# Time zones are dropped from both timestamps before subtracting them.
measurement_df['age_at_measurement'] = (
    measurement_df['measurement_datetime'].dt.tz_localize(None)
    .sub(measurement_df['date_of_birth'].dt.tz_localize(None))
    .dt.days // 365.24)
measurement_df['age_group'] = pd.cut(
    measurement_df['age_at_measurement'],
    [-np.inf, 34.5, 49.5, 64.5, np.inf],
    labels=["<35", "35-49", "50-64", "65+"])

# meas_filter is a column of True and False
meas_filter = (measurement_df['standard_concept_name'].eq(measurement_to_plot)
               & measurement_df['unit_concept_name'].ne('No matching concept')
               & measurement_df['unit_concept_name'].notna()
               & measurement_df['sex_at_birth'].ne('No matching concept')
               & measurement_df['value_as_number'].lt(9999999)  # Get rid of nonsensical outliers.
               & measurement_df['age_at_measurement'].notna())

(ggplot(measurement_df[meas_filter], aes(x='age_group', y='value_as_number'))
 + geom_boxplot()
 + stat_summary(fun_data=get_boxplot_fun_data, geom='text', size=10,
                position=position_dodge(width=0.9), va='top')
 # + scale_y_log10()  # Uncomment if the data looks skewed.
 + coord_flip()
 + facet_wrap(['standard_concept_name + ": " + unit_concept_name', 'sex_at_birth'], ncol=2, scales='free')
 + xlab('age group')
 + ggtitle('Numeric values of measurements by age and sex_at_birth\nSource: All Of Us Data')
 + theme(figure_size=(12, 12), panel_spacing=.5))
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# Use snippet 'measurement_by_sex_at_birth' to plot joined demographics and measurements dataframes.
# This plot assumes 'measurement_df' was created using snippet 'Basic operations -> join_dataframes' to
# join together demographics and measurements dataframes.
# See also https://plotnine.readthedocs.io/en/stable/


# There could be many different measurements in the dataframe. By default, plot the first one.
measurement_to_plot = measurement_df['standard_concept_name'].unique()[0]

# meas_filter is a column of True and False.
meas_filter = (measurement_df['standard_concept_name'].eq(measurement_to_plot)
               & measurement_df['unit_concept_name'].ne('No matching concept')
               & measurement_df['unit_concept_name'].notna()
               & measurement_df['value_as_number'].lt(9999999))  # Get rid of nonsensical outliers.

(ggplot(measurement_df[meas_filter], aes(x='sex_at_birth', y='value_as_number'))
 + geom_boxplot()
 + stat_summary(fun_data=get_boxplot_fun_data, geom='text', size=10,
                position=position_dodge(width=0.9), va='top')
 # + scale_y_log10()  # Uncomment if the data looks skewed.
 + facet_wrap(('standard_concept_name', 'unit_concept_name'), ncol=2, scales='free')
 + ggtitle('Numeric values of measurements, by sex_at_birth\nSource: All Of Us Data')
 + theme(figure_size=(12, 6), panel_spacing=.5, axis_text_x=element_text(angle=25, hjust=1)))
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# Use snippet 'summarize_a_survey_module' to output a table and a graph of
# participant counts by response for one question_concept_id.
# The snippet assumes that a dataframe containing survey questions and answers already exists.
# The snippet also assumes that setup has been run.

# Update the next 3 lines
survey_df = YOUR_DATASET_NAME_survey_df
question_concept_id = 1585940
denominator = None  # e.g: 200000
10+
11+
####################################################################################
12+
# DON'T CHANGE FROM HERE
13+
####################################################################################
14+
def summarize_a_question_concept_id(df, question_concept_id, denominator=None):
    """Display a table and a bar chart of participant counts per answer for one survey question.

    Args:
      df: Survey dataframe; the columns 'question_concept_id', 'question',
        'answer_concept_id', 'answer' and 'person_id' are read.
      question_concept_id: Value of 'question_concept_id' selecting the question to summarize.
      denominator: Optional participant total. When truthy, a 'response_rate' column
        (100 * n_participant / denominator, rounded to 2 decimals) is added to the table.
    Returns:
      None. The results are printed and displayed; when no row matches
      question_concept_id only an error message is printed.
    """
    responses = df.loc[df['question_concept_id'] == question_concept_id]
    # Validate first, so an id with no matching rows never reaches the aggregation below.
    if responses.empty:
        print("There is an error with your question_concept_id")
        return

    # Count distinct participants for every (answer_concept_id, answer) pair.
    counts = (responses.groupby(['answer_concept_id', 'answer'])['person_id']
              .nunique()
              .reset_index()
              .rename(columns={'person_id': 'n_participant'})
              .assign(answer_concept_id=lambda x: np.int32(x.answer_concept_id)))
    if denominator:
        counts['response_rate'] = round(100 * counts['n_participant'] / denominator, 2)

    print(f"Distribution of response to {responses['question'].unique()[0]}")
    # show table
    display(counts)
    # show graph
    display(ggplot(data=counts) +
            geom_bar(aes(x='answer', y='n_participant'), stat='identity') +
            coord_flip() +
            labs(y="Participant count", x="") +
            theme_bw())
35+
36+
summarize_a_question_concept_id(df=survey_df,
                                question_concept_id=question_concept_id,
                                denominator=denominator)
37+
38+
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# Use snippet 'summarize_a_survey_module' to print a table of participant counts by question in a module.
# The snippet assumes that a dataframe containing survey questions and answers already exists.

# Update the next 3 lines
survey_df = YOUR_DATASET_NAME_survey_df
module_name = 'The Basics'  # e.g: 'The Basics', 'Lifestyle', 'Overall Health', etc.
denominator = None  # e.g: 200000
8+
9+
####################################################################################
10+
# DON'T CHANGE FROM HERE
11+
####################################################################################
12+
13+
def summarize_a_module(df, module=None, denominator=None):
    """Count distinct participants per question, optionally for a single survey module.

    Args:
      df: Survey dataframe; the columns 'survey', 'question_concept_id', 'question'
        and 'person_id' are read.
      module: Optional survey name. When truthy, only rows whose 'survey' value equals
        it (compared case-insensitively) are summarized.
      denominator: Optional participant total. When truthy, a 'response_rate' column
        (100 * n_participant / denominator, rounded to 2 decimals) is added.
    Returns:
      A data frame with one row per (survey, question_concept_id, question) and the
      distinct person_id count in column 'n_participant'.
    """
    if module:
        wanted = module.lower()
        df = df[df['survey'].str.lower() == wanted].copy()

    per_question = df.groupby(['survey', 'question_concept_id', 'question'])['person_id'].nunique()
    summary = per_question.reset_index(name='n_participant')

    if denominator:
        summary['response_rate'] = round(100 * summary['n_participant'] / denominator, 2)
    return summary
22+
23+
summarize_a_module(
    df=survey_df,
    module=module_name,
    denominator=denominator,
)
24+
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# Documentation
2+
3+
[Documentation](https://aousupporthelp.zendesk.com/hc/en-us/articles/360039856791-How-to-use-code-snippets-in-Jupyter-Notebooks)

0 commit comments

Comments
 (0)