Skip to content

Commit df03c8f

Browse files
committed
refactor: bash script generates an sql file to create all psql concepts
1 parent 3c106bf commit df03c8f

File tree

5 files changed

+272
-225
lines changed

5 files changed

+272
-225
lines changed
Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
#!/bin/bash
2+
# This shell script converts BigQuery .sql files into PostgreSQL .sql files.
3+
4+
# path in which we create the postgres concepts
5+
TARGET_PATH='../concepts_postgres'
6+
mkdir -p $TARGET_PATH
7+
8+
# String replacements are necessary for some queries.
9+
10+
# Schema replacement: change `physionet-data.<dataset>.<table>` to just <table> (with no backticks)
11+
export REGEX_SCHEMA='s/`physionet-data.(mimiciii_clinical|mimiciii_derived|mimiciii_notes).([A-Za-z0-9_-]+)`/\2/g'
12+
# Note that these queries are very senstive to changes, e.g. adding whitespaces after comma can already change the behavior.
13+
export REGEX_DATETIME_DIFF="s/DATETIME_DIFF\(([^,]+), ?(.*), ?(DAY|MINUTE|SECOND|HOUR|YEAR)\)/DATETIME_DIFF(\1, \2, '\3')/g"
14+
export REGEX_DATETIME_TRUNC="s/DATETIME_TRUNC\(([^,]+), ?(DAY|MINUTE|SECOND|HOUR|YEAR)\)/DATE_TRUNC('\2', \1)/g"
15+
# Add necessary quotes to INTERVAL, e.g. "INTERVAL 5 hour" to "INTERVAL '5' hour"
16+
export REGEX_INTERVAL="s/interval ([[:digit:]]+) (hour|day|month|year)/INTERVAL '\1' \2/gI"
17+
# Specific queries for some problems that arose with some files.
18+
export REGEX_INT="s/CAST\(hr AS INT64\)/CAST\(hr AS bigint\)/g"
19+
export REGEX_ARRAY="s/GENERATE_ARRAY\(-24, CEIL\(DATETIME\_DIFF\(it\.outtime_hr, it\.intime_hr, HOUR\)\)\)/ARRAY\(SELECT \* FROM generate\_series\(-24, CEIL\(DATETIME\_DIFF\(it\.outtime_hr, it\.intime_hr, HOUR\)\)\)\)/g"
20+
export REGEX_HOUR_INTERVAL="s/INTERVAL CAST\(hr AS INT64\) HOUR/interval \'1\' hour * CAST\(hr AS bigint\)/g"
21+
export REGEX_SECONDS="s/SECOND\)/\'SECOND\'\)/g"
22+
23+
# tables we want to run before all other concepts
24+
# usually because they are used as dependencies
25+
DIR_AND_TABLES_TO_PREBUILD='demographics.icustay_times demographics.icustay_hours .echo_data .code_status .rrt durations.weight_durations fluid_balance.urine_output organfailure.kdigo_uo'
26+
27+
# tables which are written directly in postgresql and source code controlled
28+
# this is usually because there is no trivial conversion between bq/psql syntax
29+
DIR_AND_TABLES_ALREADY_IN_PSQL='demographics.icustay_times demographics.icustay_hours demographics.note_counts diagnosis.ccs_dx'
30+
31+
# tables which we want to run after all other concepts
32+
# usually because they depend on one or more other queries
33+
DIR_AND_TABLES_TO_SKIP=''
34+
35+
# First, we re-create the postgres-make-concepts.sql file.
36+
echo "\echo ''" > $TARGET_PATH/postgres-make-concepts.sql
37+
38+
# Now we add some preamble for the user running the script.
39+
echo "\echo '==='" >> $TARGET_PATH/postgres-make-concepts.sql
40+
echo "\echo 'Beginning to create materialized views for MIMIC database.'" >> $TARGET_PATH/postgres-make-concepts.sql
41+
echo "\echo '"'Any notices of the form "NOTICE: materialized view "XXXXXX" does not exist" can be ignored.'"'" >> $TARGET_PATH/postgres-make-concepts.sql
42+
echo "\echo 'The scripts drop views before creating them, and these notices indicate nothing existed prior to creating the view.'" >> $TARGET_PATH/postgres-make-concepts.sql
43+
echo "\echo '==='" >> $TARGET_PATH/postgres-make-concepts.sql
44+
echo "\echo ''" >> $TARGET_PATH/postgres-make-concepts.sql
45+
46+
# ======================================== #
47+
# === CONCEPTS WHICH WE MUST RUN FIRST === #
48+
# ======================================== #
49+
echo -n "Dependencies:"
50+
51+
# output table creation calls to the make-concepts script
52+
echo "" >> $TARGET_PATH/postgres-make-concepts.sql
53+
echo "-- dependencies" >> $TARGET_PATH/postgres-make-concepts.sql
54+
55+
for dir_and_table in $DIR_AND_TABLES_TO_PREBUILD;
56+
do
57+
d=`echo ${dir_and_table} | cut -d. -f1`
58+
tbl=`echo ${dir_and_table} | cut -d. -f2`
59+
60+
if [[ $d == '' ]]; then
61+
d='.'
62+
fi
63+
64+
# make the sub-folder for postgres if it does not exist
65+
mkdir -p "$TARGET_PATH/${d}"
66+
67+
# convert the bigquery script to psql and output it to the appropriate subfolder
68+
echo -n " ${d}.${tbl} .."
69+
70+
# re-write the script into psql using regex
71+
# the if statement ensures we do not overwrite tables which are already written in psql
72+
if ! [[ "$DIR_AND_TABLES_ALREADY_IN_PSQL" =~ "$d.$tbl" ]]; then
73+
echo "-- THIS SCRIPT IS AUTOMATICALLY GENERATED. DO NOT EDIT IT DIRECTLY." > "${TARGET_PATH}/${d}/${tbl}.sql"
74+
echo "DROP TABLE IF EXISTS ${tbl}; CREATE TABLE ${tbl} AS " >> "${TARGET_PATH}/${d}/${tbl}.sql"
75+
cat "${d}/${tbl}.sql" | sed -r -e "${REGEX_ARRAY}" | sed -r -e "${REGEX_HOUR_INTERVAL}" | sed -r -e "${REGEX_INT}" | sed -r -e "${REGEX_DATETIME_DIFF}" | sed -r -e "${REGEX_DATETIME_TRUNC}" | sed -r -e "${REGEX_SCHEMA}" | sed -r -e "${REGEX_INTERVAL}" >> "${TARGET_PATH}/${d}/${tbl}.sql"
76+
fi
77+
78+
# write out a call to this script in the make concepts file
79+
echo "\i ${d}/${tbl}.sql" >> $TARGET_PATH/postgres-make-concepts.sql
80+
done
81+
echo " done!"
82+
83+
# ================================== #
84+
# === MAIN LOOP FOR ALL CONCEPTS === #
85+
# ================================== #
86+
87+
# Iterate through each concept subfolder, and:
88+
# (1) apply the above regular expressions to update the script
89+
# (2) output to the postgres subfolder
90+
# (3) add a line to the postgres-make-concepts.sql script to generate this table
91+
92+
# organfailure.kdigo_stages firstday.first_day_sofa sepsis.sepsis3 medication.vasoactive_agent medication.norepinephrine_equivalent_dose
93+
94+
# the order *only* matters during the conversion step because our loop is
95+
# inserting table build commands into the postgres-make-concepts.sql file
96+
for d in durations comorbidity demographics firstday fluid_balance sepsis diagnosis organfailure severityscores;
97+
do
98+
mkdir -p "$TARGET_PATH/${d}"
99+
echo -n "${d}:"
100+
echo "" >> $TARGET_PATH/postgres-make-concepts.sql
101+
echo "-- ${d}" >> $TARGET_PATH/postgres-make-concepts.sql
102+
for fn in `ls $d`;
103+
do
104+
# only run SQL queries
105+
if [[ "${fn: -4}" == ".sql" ]]; then
106+
# table name is file name minus extension
107+
tbl="${fn%????}"
108+
echo -n " ${tbl} "
109+
110+
if [[ "$DIR_AND_TABLES_TO_PREBUILD" =~ "$d.$tbl" ]]; then
111+
echo -n "(exists!) .."
112+
continue
113+
elif [[ "$DIR_AND_TABLES_TO_SKIP" =~ "$d.$tbl" ]]; then
114+
echo -n "(skipping!) .."
115+
continue
116+
else
117+
echo -n ".."
118+
fi
119+
120+
# re-write the script into psql using regex
121+
# the if statement ensures we do not overwrite tables which are already written in psql
122+
if ! [[ "$DIR_AND_TABLES_ALREADY_IN_PSQL" =~ "$d.$tbl" ]]; then
123+
echo "-- THIS SCRIPT IS AUTOMATICALLY GENERATED. DO NOT EDIT IT DIRECTLY." > "${TARGET_PATH}/${d}/${tbl}.sql"
124+
echo "DROP TABLE IF EXISTS ${tbl}; CREATE TABLE ${tbl} AS " >> "${TARGET_PATH}/${d}/${tbl}.sql"
125+
cat "${d}/${tbl}.sql" | sed -r -e "${REGEX_ARRAY}" | sed -r -e "${REGEX_HOUR_INTERVAL}" | sed -r -e "${REGEX_INT}" | sed -r -e "${REGEX_DATETIME_DIFF}" | sed -r -e "${REGEX_DATETIME_TRUNC}" | sed -r -e "${REGEX_SCHEMA}" | sed -r -e "${REGEX_INTERVAL}" >> "${TARGET_PATH}/${d}/${fn}"
126+
fi
127+
128+
# add statement to generate this table to make concepts script
129+
echo "\i ${d}/${fn}" >> ${TARGET_PATH}/postgres-make-concepts.sql
130+
fi
131+
done
132+
echo " done!"
133+
done
134+
135+
# finally generate first_day_sofa which depends on concepts in firstday folder
136+
echo "" >> ${TARGET_PATH}/postgres-make-concepts.sql
137+
echo "-- final tables which were dependent on one or more prior tables" >> ${TARGET_PATH}/postgres-make-concepts.sql
138+
139+
echo -n "final:"
140+
for dir_and_table in $DIR_AND_TABLES_TO_SKIP
141+
do
142+
d=`echo ${dir_and_table} | cut -d. -f1`
143+
tbl=`echo ${dir_and_table} | cut -d. -f2`
144+
145+
# make the sub-folder for postgres if it does not exist
146+
mkdir -p "$TARGET_PATH/${d}"
147+
148+
# convert the bigquery script to psql and output it to the appropriate subfolder
149+
echo -n " ${d}.${tbl} .."
150+
if ! [[ "$DIR_AND_TABLES_ALREADY_IN_PSQL" =~ "$d.$tbl" ]]; then
151+
echo "-- THIS SCRIPT IS AUTOMATICALLY GENERATED. DO NOT EDIT IT DIRECTLY." > "${TARGET_PATH}/${d}/${tbl}.sql"
152+
echo "DROP TABLE IF EXISTS ${tbl}; CREATE TABLE ${tbl} AS " >> "${TARGET_PATH}/${d}/${tbl}.sql"
153+
cat "${d}/${tbl}.sql" | sed -r -e "${REGEX_ARRAY}" | sed -r -e "${REGEX_HOUR_INTERVAL}" | sed -r -e "${REGEX_INT}" | sed -r -e "${REGEX_DATETIME_DIFF}" | sed -r -e "${REGEX_DATETIME_TRUNC}" | sed -r -e "${REGEX_SCHEMA}" | sed -r -e "${REGEX_INTERVAL}" >> "${TARGET_PATH}/${d}/${fn}"
154+
fi
155+
# write out a call to this script in the make concepts file
156+
echo "\i ${d}/${tbl}.sql" >> $TARGET_PATH/postgres-make-concepts.sql
157+
done
158+
echo " done!"

0 commit comments

Comments
 (0)