Skip to content

Commit b78f297

Browse files
stefanhgmalistairewj
authored andcommitted
Added postgres concept creation.
1 parent de5d587 commit b78f297

File tree

3 files changed

+281
-0
lines changed

3 files changed

+281
-0
lines changed

mimic-iv/concepts/Readme.md

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# MIMIC-IV Concepts
2+
3+
This folder contains scripts to generate useful abstractions of raw MIMIC-IV data ("concepts"). The
4+
scripts are intended to be run against the BigQuery instantiation of MIMIC-IV, and are written in the BigQuery Standard SQL dialect. Concepts are categorized into folders if possible, otherwise they remain in the top-level directory.
5+
6+
## Generating the concepts in PostgreSQL (*nix/Mac OS X)
7+
8+
Analogously to [MIMIC-III Concepts](https://github.com/MIT-LCP/mimic-code/tree/master/concepts), the SQL scripts here are written in BigQuery's Standard SQL syntax, so the following changes are necessary to make them compatible with PostgreSQL:
9+
10+
* create postgres functions which emulate BigQuery functions (identical to MIMIC-III)
11+
* modify SQL scripts for incompatible syntax
12+
* run the modified SQL scripts and direct the output into tables in the PostgreSQL database
13+
14+
This can be done as follows (again, analogously to [MIMIC-III](https://github.com/MIT-LCP/mimic-code/tree/master/concepts)):
15+
16+
1. Open a terminal in the `concepts` folder.
17+
2. Run [postgres-functions.sql](postgres-functions.sql).
18+
* e.g. `psql -f postgres-functions.sql`
19+
* This script creates functions which emulate BigQuery syntax.
20+
3. Run [postgres_make_concepts.sh](postgres_make_concepts.sh).
21+
* e.g. `bash postgres_make_concepts.sh`
22+
* This file runs the scripts after applying a few regular expressions which convert table references and date calculations appropriately.
23+
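* This file generates all concepts in the schema named by the `TARGET_DATASET` variable in the script (`mimic_derived` as committed), because every `CREATE TABLE` statement it issues is qualified with that schema.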
24+
25+
The main changes compared to MIMIC-III are slightly different regular expressions and a loop similar to [make_concepts.sh](make_concepts.sh). Also, one of the regular expressions is now applied with `perl`, which may need to be installed.
26+
27+
### Known Problems
28+
29+
* [postgres_make_concepts.sh](postgres_make_concepts.sh) fails for [suspicion_of_infection](sepsis/suspicion_of_infection.sql) due to `, DATETIME_TRUNC(abx.starttime, DAY) AS antibiotic_date`. As a consequence, [sepsis3](sepsis/sepsis3.sql) also fails.
30+
* The script runs repeatedly for the subfolders `score` and `sepsis` to handle interdependencies between tables. The order in which the concept scripts are run could be improved.
31+
* The regular expressions in [postgres_make_concepts.sh](postgres_make_concepts.sh) depend on the current SQL scripts and might fail when they are changed.
Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
-- Functions TODO:
2+
-- FROM table CROSS JOIN UNNEST(table.column) AS col -> ???? (see icustay-hours)
3+
-- ???(column) -> PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY column) (not sure how to do median in BQ)
4+
5+
-- Put the public schema first on the search path so that the unqualified
-- CREATE FUNCTION statements in this file create their functions there.
SET search_path TO public;
6+
7+
-- Emulates BigQuery's REGEXP_EXTRACT(str, pattern).
--   str:     text to search
--   pattern: POSIX regular expression
-- Returns the result of PostgreSQL's substring(str FROM pattern): the text
-- matched by the pattern (or by its first parenthesized group, if it has one),
-- and NULL when there is no match.
CREATE OR REPLACE FUNCTION REGEXP_EXTRACT(str TEXT, pattern TEXT)
RETURNS TEXT
LANGUAGE sql
AS $$
    SELECT substring(str FROM pattern);
$$;
12+
13+
-- Emulates BigQuery's REGEXP_CONTAINS(str, pattern).
--   str:     text to search
--   pattern: POSIX regular expression
-- Returns the result of the case-sensitive regex match operator (str ~ pattern);
-- NULL when either argument is NULL.
CREATE OR REPLACE FUNCTION REGEXP_CONTAINS(str TEXT, pattern TEXT)
RETURNS BOOL
LANGUAGE sql
AS $$
    SELECT str ~ pattern;
$$;
18+
19+
-- alias generate_series with generate_array
20+
-- Exposes generate_series under BigQuery's GENERATE_ARRAY name.
--   i: first value of the series
--   j: last value of the series (inclusive)
-- Returns one INTEGER row per value (a set, not an array value).
CREATE OR REPLACE FUNCTION GENERATE_ARRAY(i INTEGER, j INTEGER)
RETURNS SETOF INTEGER
LANGUAGE sql
AS $$
    SELECT series_value
    FROM generate_series(i, j) AS series_value
$$;
24+
25+
-- datetime functions
26+
-- Emulates BigQuery's DATETIME(date): converts a DATE to a timestamp.
--   dt: the calendar date to convert
-- Returns dt cast to TIMESTAMP(3) (midnight at the start of that date).
CREATE OR REPLACE FUNCTION DATETIME(dt DATE)
RETURNS TIMESTAMP(3)
LANGUAGE sql
AS $$
    SELECT CAST(dt AS TIMESTAMP(3));
$$;
31+
32+
-- Emulates BigQuery's DATETIME(year, month, day, hour, minute, second).
--   year, month, day, hour, minute, second: components of the timestamp
-- Returns the timestamp (without time zone) made of exactly those components.
--
-- make_timestamp builds the value directly from the integers. The previous
-- implementation concatenated TO_CHAR() output and parsed it with
-- TO_TIMESTAMP(), which yields a timestamp WITH time zone; converting that
-- back to a plain timestamp shifted wall-clock times that do not exist in the
-- session time zone (e.g. inside a daylight-saving gap).
-- Requires PostgreSQL 9.4 or later (make_timestamp).
CREATE OR REPLACE FUNCTION DATETIME(year INTEGER, month INTEGER, day INTEGER, hour INTEGER, minute INTEGER, second INTEGER) RETURNS TIMESTAMP(3) AS $$
BEGIN
    RETURN MAKE_TIMESTAMP(year, month, day, hour, minute, second);
END; $$
LANGUAGE PLPGSQL;
40+
41+
-- overload allowing string input
42+
43+
-- DATETIME_ADD(datetime, INTERVAL 'n' DATEPART) -> datetime + INTERVAL 'n' DATEPART
44+
-- note: in bigquery, `INTERVAL 1 YEAR` is a valid interval
45+
-- but in postgres, it must be `INTERVAL '1' YEAR`
46+
-- Emulates BigQuery's DATETIME_ADD(datetime, interval).
--   datetime_val: starting timestamp
--   intvl:        interval to add
-- Returns datetime_val + intvl.
CREATE OR REPLACE FUNCTION DATETIME_ADD(datetime_val TIMESTAMP(3), intvl INTERVAL)
RETURNS TIMESTAMP(3)
LANGUAGE sql
AS $$
    SELECT datetime_val + intvl;
$$;
51+
52+
-- DATETIME_SUB(datetime, INTERVAL 'n' DATEPART) -> datetime - INTERVAL 'n' DATEPART
53+
-- Emulates BigQuery's DATETIME_SUB(datetime, interval).
--   datetime_val: starting timestamp
--   intvl:        interval to subtract
-- Returns datetime_val - intvl.
CREATE OR REPLACE FUNCTION DATETIME_SUB(datetime_val TIMESTAMP(3), intvl INTERVAL)
RETURNS TIMESTAMP(3)
LANGUAGE sql
AS $$
    SELECT datetime_val - intvl;
$$;
58+
59+
-- TODO:
60+
-- DATETIME_TRUNC(datetime, PART) -> DATE_TRUNC('datepart', datetime)
61+
62+
-- below requires a regex to convert datepart from primitive to a string
63+
-- i.e. encapsulate it in single quotes
64+
-- Emulates BigQuery's DATETIME_DIFF(endtime, starttime, PART).
--   endtime:   later timestamp
--   starttime: earlier timestamp
--   datepart:  'SECOND', 'MINUTE', 'HOUR', 'DAY' or 'YEAR', passed as text and
--              matched case-insensitively
-- Returns the elapsed time (endtime - starttime) in seconds divided by the
-- fixed length of the requested part, so the result can be fractional.
-- A year is taken as 365.242 days. Any other datepart yields NULL.
-- NOTE(review): BigQuery documents DATETIME_DIFF as returning a whole number
-- of parts; confirm that callers expect the fractional value returned here.
CREATE OR REPLACE FUNCTION DATETIME_DIFF(endtime TIMESTAMP(3), starttime TIMESTAMP(3), datepart TEXT) RETURNS NUMERIC AS $$
BEGIN
    RETURN
        EXTRACT(EPOCH FROM endtime - starttime) /
        -- UPPER() so that 'hour' and 'HOUR' behave the same instead of the
        -- lower-case spelling silently producing NULL.
        CASE UPPER(datepart)
            WHEN 'SECOND' THEN 1.0
            WHEN 'MINUTE' THEN 60.0
            WHEN 'HOUR' THEN 3600.0
            WHEN 'DAY' THEN 24*3600.0
            WHEN 'YEAR' THEN 365.242*24*3600.0
            ELSE NULL
        END;
END; $$
LANGUAGE PLPGSQL;
77+
78+
-- BigQuery has a custom data type, PART
79+
-- It's difficult to replicate this in postgresql, which recognizes the PART as a column name,
80+
-- unless it is within an EXTRACT() function.
81+
82+
-- Translates a BigQuery date/time format string into the template syntax used
-- by PostgreSQL's TO_CHAR / TO_DATE / TO_TIMESTAMP.
--   format_str: BigQuery format string, e.g. '%Y-%m-%d %H:%M:%S'
-- Returns the string with %S, %M, %H, %d, %m and %Y replaced by SS, MI, HH24,
-- dd, mm and yyyy. Only these few elements are handled since the function is
-- not used extensively; everything else is passed through unchanged.
CREATE OR REPLACE FUNCTION BIGQUERY_FORMAT_TO_PSQL(format_str VARCHAR(255)) RETURNS TEXT AS $$
DECLARE
    converted TEXT := format_str;
BEGIN
    converted := REPLACE(converted, '%S', 'SS');
    converted := REPLACE(converted, '%M', 'MI');
    converted := REPLACE(converted, '%H', 'HH24');
    converted := REPLACE(converted, '%d', 'dd');
    converted := REPLACE(converted, '%m', 'mm');
    converted := REPLACE(converted, '%Y', 'yyyy');
    RETURN converted;
END; $$
LANGUAGE PLPGSQL;
109+
110+
111+
-- Emulates BigQuery's FORMAT_DATE(format, value).
--   format_str:   BigQuery format string (converted by BIGQUERY_FORMAT_TO_PSQL)
--   datetime_val: timestamp to render
-- Returns datetime_val formatted by TO_CHAR with the converted template.
CREATE OR REPLACE FUNCTION FORMAT_DATE(format_str VARCHAR(255), datetime_val TIMESTAMP(3)) RETURNS TEXT AS $$
DECLARE
    -- BigQuery format string rewritten as a PostgreSQL template
    pg_template TEXT := BIGQUERY_FORMAT_TO_PSQL(format_str);
BEGIN
    RETURN TO_CHAR(datetime_val, pg_template);
END; $$
LANGUAGE PLPGSQL;
121+
122+
123+
-- Emulates BigQuery's PARSE_DATE(format, string).
--   format_str: BigQuery format string (converted by BIGQUERY_FORMAT_TO_PSQL)
--   string_val: text to parse
-- Returns the DATE produced by TO_DATE with the converted template.
CREATE OR REPLACE FUNCTION PARSE_DATE(format_str VARCHAR(255), string_val VARCHAR(255)) RETURNS DATE AS $$
DECLARE
    -- BigQuery format string rewritten as a PostgreSQL template
    pg_template TEXT := BIGQUERY_FORMAT_TO_PSQL(format_str);
BEGIN
    RETURN TO_DATE(string_val, pg_template);
END; $$
LANGUAGE PLPGSQL;
133+
134+
-- Emulates BigQuery's FORMAT_DATETIME(format, value).
--   format_str:   BigQuery format string (converted by BIGQUERY_FORMAT_TO_PSQL)
--   datetime_val: timestamp to render
-- Returns datetime_val formatted by TO_CHAR with the converted template.
CREATE OR REPLACE FUNCTION FORMAT_DATETIME(format_str VARCHAR(255), datetime_val TIMESTAMP(3)) RETURNS TEXT AS $$
DECLARE
    -- BigQuery format string rewritten as a PostgreSQL template
    pg_template TEXT := BIGQUERY_FORMAT_TO_PSQL(format_str);
BEGIN
    RETURN TO_CHAR(datetime_val, pg_template);
END; $$
LANGUAGE PLPGSQL;
144+
145+
146+
-- Emulates BigQuery's PARSE_DATETIME(format, string).
--   format_str: BigQuery format string (converted by BIGQUERY_FORMAT_TO_PSQL)
--   string_val: text to parse
-- Returns the result of TO_TIMESTAMP with the converted template, coerced to
-- the function's TIMESTAMP return type.
CREATE OR REPLACE FUNCTION PARSE_DATETIME(format_str VARCHAR(255), string_val VARCHAR(255)) RETURNS TIMESTAMP(3) AS $$
DECLARE
    -- BigQuery format string rewritten as a PostgreSQL template
    pg_template TEXT := BIGQUERY_FORMAT_TO_PSQL(format_str);
BEGIN
    RETURN TO_TIMESTAMP(string_val, pg_template);
END; $$
LANGUAGE PLPGSQL;
156+
157+
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
#!/bin/bash
# This file makes tables for the concepts in this subfolder.
# It rewrites the BigQuery SQL scripts with the sed/perl expressions defined
# below and pipes the result to psql.
# Be sure to run postgres-functions.sql first, as the concepts rely on those function definitions.
# Note that this may take a large amount of time and hard drive space.

# String replacements are necessary for some queries.
# Rewrites `physionet-data.<schema>.<table>` to <schema>.<table>.
# The table name is matched with [^`]+ so the match stops at the closing
# backtick: sed's extended regex syntax has no non-greedy ".+?", so the former
# "(.+?)" ran on to the LAST backtick of a line and mangled lines containing
# more than one table reference.
export REGEX_SCHEMA='s/`physionet-data\.(mimic_core|mimic_icu|mimic_derived|mimic_hosp)\.([^`]+)`/\1.\2/g'
# Note that these queries are very sensitive to changes, e.g. adding whitespaces after comma can already change the behavior.
# Quotes the date part of DATETIME_DIFF(a, b, PART) so it can be passed as text.
export REGEX_DATETIME_DIFF="s/DATETIME_DIFF\((.+?),\s?(.+?),\s?(DAY|MINUTE|SECOND|HOUR|YEAR)\)/DATETIME_DIFF(\1,\2,'\3')/g"
# Add necessary quotes to INTERVAL, e.g. "INTERVAL 5 hour" to "INTERVAL '5' hour"
export REGEX_INTERVAL="s/interval\s([[:digit:]]+)\s(hour|day|month|year)/INTERVAL '\1' \2/gI"
# Add numeric cast to ROUND(), e.g. "ROUND(1.234, 2)" to "ROUND( CAST(1.234 as numeric), 2)".
# Applied with perl (whole file slurped) because the first argument may span several lines.
export PERL_REGEX_ROUND='s/ROUND\(((.|\n)*?)\, /ROUND\( CAST\( \1 as numeric\)\,/g'
# Specific queries for some problems that arose with some files.
export REGEX_INT="s/CAST\(hr AS INT64\)/CAST\(hr AS bigint\)/g"
export REGEX_ARRAY="s/GENERATE_ARRAY\(-24, CEIL\(DATETIME\_DIFF\(it\.outtime_hr, it\.intime_hr, HOUR\)\)\)/ARRAY\(SELECT \* FROM generate\_series\(-24, CEIL\(DATETIME\_DIFF\(it\.outtime_hr, it\.intime_hr, HOUR\)\)\)\)/g"
export REGEX_HOUR_INTERVAL="s/INTERVAL CAST\(hr AS INT64\) HOUR/interval \'1\' hour * CAST\(hr AS bigint\)/g"
# psql connection options; adjust user, host, port and database to your setup.
export CONNSTR='-U postgres -h localhost -p 5500 -d mimic-iv' # -d mimic

# This is set as the search_path variable for psql.
# A search path of "public,mimic_icu" makes unqualified names resolve against
# the public and mimic_icu schemas. The concept tables themselves are created
# in the TARGET_DATASET schema, because every CREATE TABLE below is qualified
# with it.
export PSQL_PREAMBLE='SET search_path TO public,mimic_icu'
export TARGET_DATASET='mimic_derived'
25+
26+
# Banner explaining the notices psql prints for the DROP ... IF EXISTS statements.
printf '%s\n' \
    '' \
    '===' \
    'Beginning to create tables for MIMIC database.' \
    'Any notices of the form "NOTICE: TABLE "XXXXXX" does not exist" can be ignored.' \
    'The scripts drop views before creating them, and these notices indicate nothing existed prior to creating the view.' \
    '===' \
    ''

# The tables below are built up front; the main loop later skips them by name.
# Each one prepends the preamble + DROP/CREATE TABLE to the query file, applies
# the rewrite expressions in order, and feeds the result to psql.
# ${CONNSTR} is left unquoted on purpose so it splits into separate psql options.
echo "Generating ${TARGET_DATASET}.icustay_times"
{
    echo "${PSQL_PREAMBLE}; DROP TABLE IF EXISTS ${TARGET_DATASET}.icustay_times; CREATE TABLE ${TARGET_DATASET}.icustay_times AS "
    cat demographics/icustay_times.sql
} |
    sed -r -e "${REGEX_DATETIME_DIFF}" -e "${REGEX_SCHEMA}" -e "${REGEX_INTERVAL}" |
    perl -0777 -pe "${PERL_REGEX_ROUND}" |
    psql ${CONNSTR}

echo "Generating ${TARGET_DATASET}.weight_durations"
{
    echo "${PSQL_PREAMBLE}; DROP TABLE IF EXISTS ${TARGET_DATASET}.weight_durations; CREATE TABLE ${TARGET_DATASET}.weight_durations AS "
    cat demographics/weight_durations.sql
} |
    sed -r -e "${REGEX_DATETIME_DIFF}" -e "${REGEX_SCHEMA}" -e "${REGEX_INTERVAL}" |
    psql ${CONNSTR}

echo "Generating ${TARGET_DATASET}.urine_output"
{
    echo "${PSQL_PREAMBLE}; DROP TABLE IF EXISTS ${TARGET_DATASET}.urine_output; CREATE TABLE ${TARGET_DATASET}.urine_output AS "
    cat measurement/urine_output.sql
} |
    sed -r -e "${REGEX_DATETIME_DIFF}" -e "${REGEX_SCHEMA}" -e "${REGEX_INTERVAL}" |
    perl -0777 -pe "${PERL_REGEX_ROUND}" |
    psql ${CONNSTR}

# Explicit Regex for cast of second to 'second' in organfailure/kdigo_uo.
export REGEX_SECONDS="s/SECOND\)/\'SECOND\'\)/g"
echo "Generating ${TARGET_DATASET}.kdigo_uo"
{
    echo "${PSQL_PREAMBLE}; DROP TABLE IF EXISTS ${TARGET_DATASET}.kdigo_uo; CREATE TABLE ${TARGET_DATASET}.kdigo_uo AS "
    cat organfailure/kdigo_uo.sql
} |
    sed -r -e "${REGEX_DATETIME_DIFF}" -e "${REGEX_SCHEMA}" -e "${REGEX_INTERVAL}" -e "${REGEX_SECONDS}" |
    psql ${CONNSTR}
46+
47+
48+
# generate tables in subfolders
# order is important for a few tables here:
#  * firstday should go last
#  * sepsis depends on score (sofa.sql in particular)
#  * organfailure depends on measurement
#  * score and sepsis are repeated at the end because some tables depend on each other
for d in demographics measurement comorbidity medication organfailure treatment score sepsis firstday score sepsis;
do
    # Glob for the SQL files directly instead of parsing `ls` output; this
    # also restricts the loop to SQL queries without a separate suffix check.
    for path in "${d}"/*.sql;
    do
        # An unmatched glob is left as the literal pattern; skip it.
        [[ -f "${path}" ]] || continue

        echo "${d}"
        fn="${path##*/}"
        # table name is file name minus extension.
        # ${fn%.sql} works on every bash version, whereas the negative-length
        # substring ${fn::-4} needs bash >= 4.2 and fails on the bash 3.2
        # shipped with macOS.
        tbl="${fn%.sql}"

        # Create first_day_lab after measurements done and before it is used by scores.
        if [[ "${tbl}" == "charlson" ]]; then
            # Generate some tables first to prevent conflicts during processing.
            # Have to replace column names. Probably a mistake in the original SQL script.
            export REGEX_LAB_1="s/abs_basophils/basophils_abs/g"
            export REGEX_LAB_2="s/abs_eosinophils/eosinophils_abs/g"
            export REGEX_LAB_3="s/abs_lymphocytes/lymphocytes_abs/g"
            export REGEX_LAB_4="s/abs_monocytes/monocytes_abs/g"
            export REGEX_LAB_5="s/abs_neutrophils/neutrophils_abs/g"
            export REGEX_LAB_6="s/atyps/atypical_lymphocytes/g"
            export REGEX_LAB_7="s/imm_granulocytes/immature_granulocytes/g"
            export REGEX_LAB_8="s/metas/metamyelocytes/g"
            echo "Generating ${TARGET_DATASET}.first_day_lab"
            # ${CONNSTR} is left unquoted on purpose so it splits into separate psql options.
            {
                echo "${PSQL_PREAMBLE}; DROP TABLE IF EXISTS ${TARGET_DATASET}.first_day_lab; CREATE TABLE ${TARGET_DATASET}.first_day_lab AS "
                cat firstday/first_day_lab.sql
            } |
                sed -r -e "${REGEX_DATETIME_DIFF}" -e "${REGEX_SCHEMA}" -e "${REGEX_INTERVAL}" \
                    -e "${REGEX_LAB_1}" -e "${REGEX_LAB_2}" -e "${REGEX_LAB_3}" -e "${REGEX_LAB_4}" \
                    -e "${REGEX_LAB_5}" -e "${REGEX_LAB_6}" -e "${REGEX_LAB_7}" -e "${REGEX_LAB_8}" |
                perl -0777 -pe "${PERL_REGEX_ROUND}" |
                psql ${CONNSTR}
        fi

        # skip first_day_sofa as it depends on other firstday queries, also skip already processed tables.
        if [[ "${tbl}" == "first_day_sofa" ]] || [[ "${tbl}" == "icustay_times" ]] || [[ "${tbl}" == "weight_durations" ]] || [[ "${tbl}" == "urine_output" ]] || [[ "${tbl}" == "kdigo_uo" ]] || [[ "${tbl}" == "first_day_lab" ]]; then
            continue
        fi

        echo "Generating ${TARGET_DATASET}.${tbl}"
        {
            echo "${PSQL_PREAMBLE}; DROP TABLE IF EXISTS ${TARGET_DATASET}.${tbl}; CREATE TABLE ${TARGET_DATASET}.${tbl} AS "
            cat "${path}"
        } |
            sed -r -e "${REGEX_ARRAY}" -e "${REGEX_HOUR_INTERVAL}" -e "${REGEX_INT}" \
                -e "${REGEX_DATETIME_DIFF}" -e "${REGEX_SCHEMA}" -e "${REGEX_INTERVAL}" |
            perl -0777 -pe "${PERL_REGEX_ROUND}" |
            psql ${CONNSTR}
    done
done
89+
90+
91+
# generate first_day_sofa table last
# ${CONNSTR} is left unquoted on purpose so it splits into separate psql options.
echo "Generating ${TARGET_DATASET}.first_day_sofa"
{
    echo "${PSQL_PREAMBLE}; DROP TABLE IF EXISTS ${TARGET_DATASET}.first_day_sofa; CREATE TABLE ${TARGET_DATASET}.first_day_sofa AS "
    cat firstday/first_day_sofa.sql
} |
    sed -r -e "${REGEX_DATETIME_DIFF}" -e "${REGEX_SCHEMA}" -e "${REGEX_INTERVAL}" |
    perl -0777 -pe "${PERL_REGEX_ROUND}" |
    psql ${CONNSTR}

0 commit comments

Comments
 (0)