Skip to content

Commit 5d0e11d

Browse files
authored
Merge pull request #1363 from schu/schu/fix-mimic-iv-concepts-postgres-setup
mimic-iv/concepts: fix `postgres-make-concepts` and minor updates
2 parents 892c21c + 7d72fef commit 5d0e11d

25 files changed

+234
-82
lines changed

mimic-iv/concepts/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
This folder contains scripts to generate useful abstractions of raw MIMIC-IV data ("concepts").
44
The scripts are written using the **BigQuery Standard SQL Dialect**. Concepts are categorized into folders if possible, otherwise they remain in the top-level directory. The [postgres](/mimic-iv/concepts/postgres) subfolder contains automatically generated PostgreSQL versions of these scripts; [see below for how these were generated](#postgresql-concepts).
55

6-
The concepts are organized into individual SQL scripts, with each script generating a table. The BigQuery `mimic_derived` dataset under `physionet-data` contains the concepts pregenerated. Access to this dataset is available to MIMIC-IV approved users: see the [cloud instructions](https://mimic.mit.edu/docs/gettingstarted/cloud/) on how to access MIMIC-IV on BigQuery (which includes the derived concepts).
6+
The concepts are organized into individual SQL scripts, with each script generating a table. The BigQuery `mimiciv_derived` dataset under `physionet-data` contains the concepts pregenerated. Access to this dataset is available to MIMIC-IV approved users: see the [cloud instructions](https://mimic.mit.edu/docs/gettingstarted/cloud/) on how to access MIMIC-IV on BigQuery (which includes the derived concepts).
77

88
* [List of the concept folders and their content](#concept-index)
99
* [Generating the concept tables on BigQuery](#generating-the-concepts-on-bigquery)

mimic-iv/concepts/convert_bigquery_to_postgres.sh

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,10 @@
44
# String replacements are necessary for some queries.
55
export REGEX_SCHEMA='s/`physionet-data.(mimiciv_hosp|mimiciv_icu|mimiciv_derived).([A-Za-z0-9_-]+)`/\1.\2/g'
66
# Note that these queries are very sensitive to changes, e.g. adding whitespace after a comma can already change the behavior.
7-
export REGEX_DATETIME_DIFF="s/DATETIME_DIFF\(([^,]+), ?([^,]+), ?(DAY|MINUTE|SECOND|HOUR|YEAR)\)/DATETIME_DIFF(\1, \2, '\3')/g"
7+
export REGEX_DATETIME_DIFF="s/DATETIME_DIFF\(([^,]+), ?(.*), ?(DAY|MINUTE|SECOND|HOUR|YEAR)\)/DATETIME_DIFF(\1, \2, '\3')/g"
88
export REGEX_DATETIME_TRUNC="s/DATETIME_TRUNC\(([^,]+), ?(DAY|MINUTE|SECOND|HOUR|YEAR)\)/DATE_TRUNC('\2', \1)/g"
99
# Add necessary quotes to INTERVAL, e.g. "INTERVAL 5 hour" to "INTERVAL '5' hour"
1010
export REGEX_INTERVAL="s/interval ([[:digit:]]+) (hour|day|month|year)/INTERVAL '\1' \2/gI"
11-
# Add numeric cast to ROUND(), e.g. "ROUND(1.234, 2)" to "ROUND( CAST(1.234 as numeric), 2)".
12-
export PERL_REGEX_ROUND='s/ROUND\(((.|\n)*?)\, /ROUND\( CAST\( \1 as numeric\)\,/g'
1311
# Specific queries for some problems that arose with some files.
1412
export REGEX_INT="s/CAST\(hr AS INT64\)/CAST\(hr AS bigint\)/g"
1513
export REGEX_ARRAY="s/GENERATE_ARRAY\(-24, CEIL\(DATETIME\_DIFF\(it\.outtime_hr, it\.intime_hr, HOUR\)\)\)/ARRAY\(SELECT \* FROM generate\_series\(-24, CEIL\(DATETIME\_DIFF\(it\.outtime_hr, it\.intime_hr, HOUR\)\)\)\)/g"
@@ -51,7 +49,7 @@ do
5149

5250
# these two scripts previously also needed a perl replace to cast rounded values as numeric; both branches now run the same sed pipeline
5351
if [[ "${tbl}" == "icustay_times" ]] || [[ "${tbl}" == "urine_output" ]]; then
54-
cat "${d}/${tbl}.sql" | sed -r -e "${REGEX_ARRAY}" | sed -r -e "${REGEX_HOUR_INTERVAL}" | sed -r -e "${REGEX_INT}" | sed -r -e "${REGEX_DATETIME_DIFF}" | sed -r -e "${REGEX_DATETIME_TRUNC}" | sed -r -e "${REGEX_SCHEMA}" | sed -r -e "${REGEX_INTERVAL}" | sed -r -e "${REGEX_SECONDS}" | perl -0777 -pe "${PERL_REGEX_ROUND}" >> "postgres/${d}/${tbl}.sql"
52+
cat "${d}/${tbl}.sql" | sed -r -e "${REGEX_ARRAY}" | sed -r -e "${REGEX_HOUR_INTERVAL}" | sed -r -e "${REGEX_INT}" | sed -r -e "${REGEX_DATETIME_DIFF}" | sed -r -e "${REGEX_DATETIME_TRUNC}" | sed -r -e "${REGEX_SCHEMA}" | sed -r -e "${REGEX_INTERVAL}" | sed -r -e "${REGEX_SECONDS}" >> "postgres/${d}/${tbl}.sql"
5553
else
5654
cat "${d}/${tbl}.sql" | sed -r -e "${REGEX_ARRAY}" | sed -r -e "${REGEX_HOUR_INTERVAL}" | sed -r -e "${REGEX_INT}" | sed -r -e "${REGEX_DATETIME_DIFF}" | sed -r -e "${REGEX_DATETIME_TRUNC}" | sed -r -e "${REGEX_SCHEMA}" | sed -r -e "${REGEX_INTERVAL}" | sed -r -e "${REGEX_SECONDS}" >> "postgres/${d}/${tbl}.sql"
5755
fi
@@ -66,10 +64,19 @@ echo " done!"
6664
# (2) output to the postgres subfolder
6765
# (3) add a line to the postgres-make-concepts.sql script to generate this table
6866

69-
# order of the folders is important for a few tables here:
70-
# * scores (sofa et al) depend on labs, icustay_hourly
71-
# * sepsis depends on score (sofa.sql in particular)
72-
# * organfailure depends on measurement and firstday
67+
# we control the order by skipping tables listed in the below var
68+
DIR_AND_TABLES_TO_SKIP='demographics.icustay_times demographics.weight_durations measurement.urine_output organfailure.kdigo_uo organfailure.kdigo_stages firstday.first_day_sofa sepsis.sepsis3 medication.vasoactive_agent medication.norepinephrine_equivalent_dose'
69+
70+
# create an array to store tables for which the order of generation matters
71+
# i.e. these tables cannot be generated in alphabetical order, as done in the later loop
72+
TABLES_TO_SKIP=()
73+
for dir_and_table in $DIR_AND_TABLES_TO_SKIP;
74+
do
75+
tbl=`echo ${dir_and_table} | cut -d. -f2`
76+
TABLES_TO_SKIP+=($tbl)
77+
done
78+
79+
echo $TABLES_TO_SKIP
7380
# the order *only* matters during the conversion step because our loop is
7481
# inserting table build commands into the postgres-make-concepts.sql file
7582
for d in demographics measurement comorbidity medication treatment firstday organfailure score sepsis;
@@ -93,19 +100,24 @@ do
93100
echo -n " ${tbl} .."
94101
echo "-- THIS SCRIPT IS AUTOMATICALLY GENERATED. DO NOT EDIT IT DIRECTLY." > "postgres/${d}/${tbl}.sql"
95102
echo "DROP TABLE IF EXISTS ${tbl}; CREATE TABLE ${tbl} AS " >> "postgres/${d}/${tbl}.sql"
96-
cat "${d}/${tbl}.sql" | sed -r -e "${REGEX_ARRAY}" | sed -r -e "${REGEX_HOUR_INTERVAL}" | sed -r -e "${REGEX_INT}" | sed -r -e "${REGEX_DATETIME_DIFF}" | sed -r -e "${REGEX_DATETIME_TRUNC}" | sed -r -e "${REGEX_SCHEMA}" | sed -r -e "${REGEX_INTERVAL}" | perl -0777 -pe "${PERL_REGEX_ROUND}" >> "postgres/${d}/${fn}"
103+
cat "${d}/${tbl}.sql" | sed -r -e "${REGEX_ARRAY}" | sed -r -e "${REGEX_HOUR_INTERVAL}" | sed -r -e "${REGEX_INT}" | sed -r -e "${REGEX_DATETIME_DIFF}" | sed -r -e "${REGEX_DATETIME_TRUNC}" | sed -r -e "${REGEX_SCHEMA}" | sed -r -e "${REGEX_INTERVAL}" >> "postgres/${d}/${fn}"
97104

98-
echo "\i ${d}/${fn}" >> postgres/postgres-make-concepts.sql
105+
if [[ ! " ${TABLES_TO_SKIP[*]} " =~ " ${tbl} " ]]; then
106+
# this table is *not* in our skip array
107+
# therefore, we print it out to the make concepts script
108+
echo "\i ${d}/${fn}" >> postgres/postgres-make-concepts.sql
109+
fi
99110
fi
100111
done
101112
echo " done!"
102113
done
103114

104115
# finally generate the tables skipped in the loop above (e.g. first_day_sofa), which depend on one or more previously generated concepts
105116
echo "" >> postgres/postgres-make-concepts.sql
106-
echo "-- final tables dependent on previous concepts" >> postgres/postgres-make-concepts.sql
117+
echo "-- final tables which were dependent on one or more prior tables" >> postgres/postgres-make-concepts.sql
107118

108-
for dir_and_table in firstday.first_day_sofa sepsis.sepsis3
119+
echo -n "final:"
120+
for dir_and_table in $DIR_AND_TABLES_TO_SKIP
109121
do
110122
d=`echo ${dir_and_table} | cut -d. -f1`
111123
tbl=`echo ${dir_and_table} | cut -d. -f2`

mimic-iv/concepts/demographics/icustay_detail.sql

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ SELECT ie.subject_id, ie.hadm_id, ie.stay_id
1616

1717
-- icu level factors
1818
, ie.intime as icu_intime, ie.outtime as icu_outtime
19-
, ROUND(DATETIME_DIFF(ie.outtime, ie.intime, HOUR)/24.0, 2) as los_icu
19+
, ROUND(CAST(DATETIME_DIFF(ie.outtime, ie.intime, HOUR)/24.0 AS NUMERIC), 2) as los_icu
2020
, DENSE_RANK() OVER (PARTITION BY ie.hadm_id ORDER BY ie.intime) AS icustay_seq
2121

2222
-- first ICU stay *for the current hospitalization*

mimic-iv/concepts/firstday/first_day_height.sql

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ WITH ce AS
2121
SELECT
2222
ie.subject_id
2323
, ie.stay_id
24-
, ROUND(AVG(height), 2) AS height
24+
, ROUND(CAST(AVG(height) AS NUMERIC), 2) AS height
2525
FROM `physionet-data.mimiciv_icu.icustays` ie
2626
LEFT JOIN `physionet-data.mimiciv_derived.height` ht
2727
ON ie.stay_id = ht.stay_id

mimic-iv/concepts/make_concepts.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#!/bin/bash
2-
# This script generates the concepts in the BigQuery table mimic_derived.
2+
# This script generates the concepts in the BigQuery dataset mimiciv_derived.
33
export TARGET_DATASET=mimiciv_derived
44

55
# specify bigquery query command options

mimic-iv/concepts/measurement/blood_differential.sql

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -94,31 +94,31 @@ subject_id, hadm_id, charttime, specimen_id
9494

9595
, wbc
9696
-- impute absolute count if percentage & WBC is available
97-
, ROUND(CASE
97+
, ROUND(CAST(CASE
9898
WHEN basophils_abs IS NULL AND basophils IS NOT NULL AND impute_abs = 1
9999
THEN basophils * wbc / 100
100100
ELSE basophils_abs
101-
END, 4) AS basophils_abs
102-
, ROUND(CASE
101+
END AS NUMERIC), 4) AS basophils_abs
102+
, ROUND(CAST(CASE
103103
WHEN eosinophils_abs IS NULL AND eosinophils IS NOT NULL AND impute_abs = 1
104104
THEN eosinophils * wbc / 100
105105
ELSE eosinophils_abs
106-
END, 4) AS eosinophils_abs
107-
, ROUND(CASE
106+
END AS NUMERIC), 4) AS eosinophils_abs
107+
, ROUND(CAST(CASE
108108
WHEN lymphocytes_abs IS NULL AND lymphocytes IS NOT NULL AND impute_abs = 1
109109
THEN lymphocytes * wbc / 100
110110
ELSE lymphocytes_abs
111-
END, 4) AS lymphocytes_abs
112-
, ROUND(CASE
111+
END AS NUMERIC), 4) AS lymphocytes_abs
112+
, ROUND(CAST(CASE
113113
WHEN monocytes_abs IS NULL AND monocytes IS NOT NULL AND impute_abs = 1
114114
THEN monocytes * wbc / 100
115115
ELSE monocytes_abs
116-
END, 4) AS monocytes_abs
117-
, ROUND(CASE
116+
END AS NUMERIC), 4) AS monocytes_abs
117+
, ROUND(CAST(CASE
118118
WHEN neutrophils_abs IS NULL AND neutrophils IS NOT NULL AND impute_abs = 1
119119
THEN neutrophils * wbc / 100
120120
ELSE neutrophils_abs
121-
END, 4) AS neutrophils_abs
121+
END AS NUMERIC), 4) AS neutrophils_abs
122122

123123
, basophils
124124
, eosinophils

mimic-iv/concepts/measurement/height.sql

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ WITH ht_in AS
44
SELECT
55
c.subject_id, c.stay_id, c.charttime
66
-- Ensure that all heights are in centimeters
7-
, ROUND(c.valuenum * 2.54, 2) AS height
7+
, ROUND(CAST(c.valuenum * 2.54 AS NUMERIC), 2) AS height
88
, c.valuenum as height_orig
99
FROM `physionet-data.mimiciv_icu.chartevents` c
1010
WHERE c.valuenum IS NOT NULL
@@ -16,7 +16,7 @@ WITH ht_in AS
1616
SELECT
1717
c.subject_id, c.stay_id, c.charttime
1818
-- Ensure that all heights are in centimeters
19-
, ROUND(c.valuenum, 2) AS height
19+
, ROUND(CAST(c.valuenum AS NUMERIC), 2) AS height
2020
FROM `physionet-data.mimiciv_icu.chartevents` c
2121
WHERE c.valuenum IS NOT NULL
2222
-- Height cm

mimic-iv/concepts/measurement/oxygen_delivery.sql

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ with ce_stg1 as
6161
, itemid
6262
, value AS o2_device
6363
, ROW_NUMBER() OVER (PARTITION BY subject_id, charttime, itemid ORDER BY value) as rn
64-
FROM mimic_icu.chartevents
64+
FROM mimiciv_icu.chartevents
6565
WHERE itemid = 226732 -- oxygen delivery device(s)
6666
)
6767
, stg AS
@@ -95,4 +95,4 @@ SELECT
9595
, MAX(CASE WHEN rn = 4 THEN o2_device ELSE NULL END) AS o2_delivery_device_4
9696
FROM stg
9797
GROUP BY subject_id, charttime
98-
;
98+
;

mimic-iv/concepts/measurement/urine_output_rate.sql

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -81,9 +81,9 @@ select
8181
, CASE WHEN uo_tm_12hr >= 12 THEN ROUND(CAST((ur.urineoutput_12hr/wd.weight/uo_tm_12hr) AS NUMERIC), 4) END AS uo_mlkghr_12hr
8282
, CASE WHEN uo_tm_24hr >= 24 THEN ROUND(CAST((ur.urineoutput_24hr/wd.weight/uo_tm_24hr) AS NUMERIC), 4) END AS uo_mlkghr_24hr
8383
-- time of earliest UO measurement that was used to calculate the rate
84-
, ROUND(uo_tm_6hr, 2) AS uo_tm_6hr
85-
, ROUND(uo_tm_12hr, 2) AS uo_tm_12hr
86-
, ROUND(uo_tm_24hr, 2) AS uo_tm_24hr
84+
, ROUND(CAST(uo_tm_6hr AS NUMERIC), 2) AS uo_tm_6hr
85+
, ROUND(CAST(uo_tm_12hr AS NUMERIC), 2) AS uo_tm_12hr
86+
, ROUND(CAST(uo_tm_24hr AS NUMERIC), 2) AS uo_tm_24hr
8787
from ur_stg ur
8888
LEFT JOIN `physionet-data.mimiciv_derived.weight_durations` wd
8989
ON ur.stay_id = wd.stay_id

mimic-iv/concepts/measurement/vitalsign.sql

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,10 @@ select
1212
, AVG(case when itemid = 220180 and valuenum > 0 and valuenum < 300 then valuenum else null end) as dbp_ni
1313
, AVG(case when itemid = 220181 and valuenum > 0 and valuenum < 300 then valuenum else null end) as mbp_ni
1414
, AVG(case when itemid in (220210,224690) and valuenum > 0 and valuenum < 70 then valuenum else null end) as resp_rate
15-
, ROUND(
15+
, ROUND(CAST(
1616
AVG(case when itemid in (223761) and valuenum > 70 and valuenum < 120 then (valuenum-32)/1.8 -- converted to degC in valuenum call
1717
when itemid in (223762) and valuenum > 10 and valuenum < 50 then valuenum else null end)
18-
, 2) as temperature
18+
AS NUMERIC), 2) as temperature
1919
, MAX(CASE WHEN itemid = 224642 THEN value ELSE NULL END) AS temperature_site
2020
, AVG(case when itemid in (220277) and valuenum > 0 and valuenum <= 100 then valuenum else null end) as spo2
2121
, AVG(case when itemid in (225664,220621,226537) and valuenum > 0 then valuenum else null end) as glucose

0 commit comments

Comments
 (0)