Added postgres concept creation.

stefanhgm · alistairewj · commit b78f297470ee · 2021-08-24T14:03:44.000-04:00
diff --git a/mimic-iv/concepts/Readme.md b/mimic-iv/concepts/Readme.md
@@ -0,0 +1,31 @@
+# MIMIC-IV Concepts
+
+This folder contains scripts to generate useful abstractions of raw MIMIC-IV data ("concepts"). The
+scripts are intended to be run against the BigQuery instantiation of MIMIC-IV, and are written in the BigQuery Standard SQL dialect. Concepts are categorized into folders if possible, otherwise they remain in the top-level directory.
+
+## Generating the concepts in PostgreSQL (*nix/Mac OS X)
+
+Analogously to [MIMIC-III Concepts](https://github.com/MIT-LCP/mimic-code/tree/master/concepts), the SQL scripts here are written in BigQuery's Standard SQL syntax, so the following changes are necessary to make them compatible with PostgreSQL:
+
+* create postgres functions which emulate BigQuery functions (identical to MIMIC-III)
+* modify SQL scripts for incompatible syntax
+* run the modified SQL scripts and direct the output into tables in the PostgreSQL database
+
+This can be done as follows (again, analogously to [MIMIC-III](https://github.com/MIT-LCP/mimic-code/tree/master/concepts)):
+
+1. Open a terminal in the `concepts` folder.
+2. Run [postgres-functions.sql](postgres-functions.sql).
+    * e.g. `psql -f postgres-functions.sql`
+    * This script creates functions which emulate BigQuery syntax.
+3. Run [postgres_make_concepts.sh](postgres_make_concepts.sh).
+    * e.g. `bash postgres_make_concepts.sh`
+    * This file runs the scripts after applying a few regular expressions which convert table references and date calculations appropriately.
+    * This file generates all concepts in the `mimic_derived` schema (set by the `TARGET_DATASET` variable in the script).
+
+The main changes compared to MIMIC-III are slightly different regular expressions and a loop similar to [make_concepts.sh](make_concepts.sh). Also, one of the regular expressions is now applied with `perl`, which may need to be installed.
+
+### Known Problems
+
+* [postgres_make_concepts.sh](postgres_make_concepts.sh) fails for [suspicion_of_infection](sepsis/suspicion_of_infection.sql) due to `, DATETIME_TRUNC(abx.starttime, DAY) AS antibiotic_date`. As a consequence also [sepsis3](sepsis/sepsis3.sql) fails.
+* The script runs repeatedly for subfolders `score` and `sepsis` to handle interdependencies between tables. Running the concept scripts in the correct order can be improved.
+* The regular expressions in [postgres_make_concepts.sh](postgres_make_concepts.sh) depend on the current SQL scripts and might fail when they are changed.
diff --git a/mimic-iv/concepts/postgres-functions.sql b/mimic-iv/concepts/postgres-functions.sql
@@ -0,0 +1,157 @@
+-- Functions TODO:
+--  FROM table CROSS JOIN UNNEST(table.column) AS col -> ????  (see icustay-hours)
+--  ???(column) -> PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY column)    (not sure how to do median in BQ)
+
+SET search_path TO public;
+
+-- Emulates BigQuery's REGEXP_EXTRACT(str, pattern) using the regular-expression
+-- form of PostgreSQL's substring().
+--   str     : text to search
+--   pattern : regular expression handed to substring(... FROM ...)
+-- Returns whatever substring(str FROM pattern) yields.
+CREATE OR REPLACE FUNCTION REGEXP_EXTRACT(str TEXT, pattern TEXT) RETURNS TEXT AS $$
+DECLARE
+    extracted TEXT;
+BEGIN
+    extracted := substring(str FROM pattern);
+    RETURN extracted;
+END; $$
+LANGUAGE PLPGSQL;
+
+-- Emulates BigQuery's REGEXP_CONTAINS(str, pattern) with the PostgreSQL
+-- regular-expression match operator (~).
+--   str     : text to test
+--   pattern : regular expression
+-- Returns the result of `str ~ pattern`.
+CREATE OR REPLACE FUNCTION REGEXP_CONTAINS(str TEXT, pattern TEXT) RETURNS BOOL AS $$
+DECLARE
+    is_match BOOL;
+BEGIN
+    is_match := (str ~ pattern);
+    RETURN is_match;
+END; $$
+LANGUAGE PLPGSQL;
+
+-- Alias for generate_series under BigQuery's GENERATE_ARRAY name.
+--   i : first value of the series
+--   j : last value of the series
+-- Returns the integers as a set of rows (setof INTEGER), not as an array value.
+CREATE OR REPLACE FUNCTION GENERATE_ARRAY(i INTEGER, j INTEGER)
+RETURNS setof INTEGER language sql as $$
+    SELECT series.val
+    FROM GENERATE_SERIES(i, j) AS series(val)
+$$;
+
+-- datetime functions
+-- Emulates BigQuery's DATETIME(date): converts a DATE into a timestamp.
+--   dt : calendar date to convert
+-- Returns dt cast to TIMESTAMP(3).
+CREATE OR REPLACE FUNCTION DATETIME(dt DATE) RETURNS TIMESTAMP(3) AS $$
+DECLARE
+    converted TIMESTAMP(3);
+BEGIN
+    converted := CAST(dt AS TIMESTAMP(3));
+    RETURN converted;
+END; $$
+LANGUAGE PLPGSQL;
+
+-- Emulates BigQuery's DATETIME(year, month, day, hour, minute, second).
+--   year .. second : integer components of the wall-clock time
+-- Returns the corresponding TIMESTAMP(3); NULL if any component is NULL.
+-- Built with make_timestamp() so the value never passes through
+-- TIMESTAMP WITH TIME ZONE: a string + TO_TIMESTAMP round trip depends on the
+-- session time zone and shifts wall-clock times that fall into a DST gap.
+CREATE OR REPLACE FUNCTION DATETIME(year INTEGER, month INTEGER, day INTEGER, hour INTEGER, minute INTEGER, second INTEGER) RETURNS TIMESTAMP(3) AS $$
+BEGIN
+RETURN CAST(
+    MAKE_TIMESTAMP(year, month, day, hour, minute, second)
+    AS TIMESTAMP(3)
+);
+END; $$
+LANGUAGE PLPGSQL;
+
+-- overload allowing string input
+
+-- Emulates BigQuery's DATETIME_ADD:
+--   DATETIME_ADD(datetime, INTERVAL 'n' DATEPART) -> datetime + INTERVAL 'n' DATEPART
+-- Note: BigQuery accepts `INTERVAL 1 YEAR`, whereas postgres requires the
+-- quantity to be quoted, i.e. `INTERVAL '1' YEAR`.
+--   datetime_val : starting timestamp
+--   intvl        : interval to add
+-- Returns datetime_val shifted forward by intvl.
+CREATE OR REPLACE FUNCTION DATETIME_ADD(datetime_val TIMESTAMP(3), intvl INTERVAL) RETURNS TIMESTAMP(3) AS $$
+DECLARE
+    shifted TIMESTAMP(3);
+BEGIN
+    shifted := datetime_val + intvl;
+    RETURN shifted;
+END; $$
+LANGUAGE PLPGSQL;
+
+-- Emulates BigQuery's DATETIME_SUB:
+--   DATETIME_SUB(datetime, INTERVAL 'n' DATEPART) -> datetime - INTERVAL 'n' DATEPART
+--   datetime_val : starting timestamp
+--   intvl        : interval to subtract
+-- Returns datetime_val shifted backward by intvl.
+CREATE OR REPLACE FUNCTION DATETIME_SUB(datetime_val TIMESTAMP(3), intvl INTERVAL) RETURNS TIMESTAMP(3) AS $$
+DECLARE
+    shifted TIMESTAMP(3);
+BEGIN
+    shifted := datetime_val - intvl;
+    RETURN shifted;
+END; $$
+LANGUAGE PLPGSQL;
+
+-- TODO:
+--   DATETIME_TRUNC(datetime, PART) -> DATE_TRUNC('datepart', datetime)
+
+-- Emulates BigQuery's DATETIME_DIFF(endtime, starttime, PART).
+-- The date part must arrive as a quoted string (e.g. 'HOUR'), so the calling
+-- SQL needs a regex that wraps the bare BigQuery PART keyword in single quotes.
+--   endtime, starttime : timestamps to compare (the result is endtime - starttime)
+--   datepart           : 'SECOND', 'MINUTE', 'HOUR', 'DAY' or 'YEAR', in any letter case
+-- Returns the elapsed seconds divided by the length of the part, i.e. a
+-- fractional value; a YEAR is taken as 365.242 days.
+-- Returns NULL for any other date part.
+-- NOTE(review): BigQuery's own DATETIME_DIFF is documented as returning whole
+-- part counts rather than fractions -- confirm before comparing results across engines.
+CREATE OR REPLACE FUNCTION DATETIME_DIFF(endtime TIMESTAMP(3), starttime TIMESTAMP(3), datepart TEXT) RETURNS NUMERIC AS $$
+BEGIN
+RETURN
+    EXTRACT(EPOCH FROM endtime - starttime) /
+    -- normalise the case so that 'hour' is not silently mapped to NULL
+    CASE UPPER(datepart)
+        WHEN 'SECOND' THEN 1.0
+        WHEN 'MINUTE' THEN 60.0
+        WHEN 'HOUR' THEN 3600.0
+        WHEN 'DAY' THEN 24*3600.0
+        WHEN 'YEAR' THEN 365.242*24*3600.0
+    ELSE NULL END;
+END; $$
+LANGUAGE PLPGSQL;
+
+-- BigQuery has a custom data type, PART
+-- It's difficult to replicate this in postgresql, which recognizes the PART as a column name,
+-- unless it is within an EXTRACT() function.
+
+-- Translates a BigQuery date/time format string into the equivalent postgres
+-- TO_CHAR / TO_DATE / TO_TIMESTAMP template.
+-- Only a handful of directives are handled since this function is not used
+-- extensively: %S, %M, %H, %d, %m and %Y. Everything else is passed through.
+--   format_str : BigQuery format string
+-- Returns the converted template.
+CREATE OR REPLACE FUNCTION BIGQUERY_FORMAT_TO_PSQL(format_str VARCHAR(255)) RETURNS TEXT AS $$
+DECLARE
+    converted TEXT := format_str;
+BEGIN
+    -- same order as the directives are listed above: time parts first, then date parts
+    converted := REPLACE(converted, '%S', 'SS');
+    converted := REPLACE(converted, '%M', 'MI');
+    converted := REPLACE(converted, '%H', 'HH24');
+    converted := REPLACE(converted, '%d', 'dd');
+    converted := REPLACE(converted, '%m', 'mm');
+    converted := REPLACE(converted, '%Y', 'yyyy');
+    RETURN converted;
+END; $$
+LANGUAGE PLPGSQL;
+
+
+-- Emulates BigQuery's FORMAT_DATE(format_str, value).
+--   format_str   : BigQuery format string, translated by BIGQUERY_FORMAT_TO_PSQL
+--   datetime_val : value to format
+-- Returns datetime_val rendered by TO_CHAR with the translated template.
+CREATE OR REPLACE FUNCTION FORMAT_DATE(format_str VARCHAR(255), datetime_val TIMESTAMP(3)) RETURNS TEXT AS $$
+DECLARE
+    -- postgres template equivalent to the BigQuery format string
+    psql_format TEXT := BIGQUERY_FORMAT_TO_PSQL(format_str);
+BEGIN
+    RETURN TO_CHAR(datetime_val, psql_format);
+END; $$
+LANGUAGE PLPGSQL;
+
+
+-- Emulates BigQuery's PARSE_DATE(format_str, string_val).
+--   format_str : BigQuery format string, translated by BIGQUERY_FORMAT_TO_PSQL
+--   string_val : text to parse
+-- Returns the DATE produced by TO_DATE with the translated template.
+CREATE OR REPLACE FUNCTION PARSE_DATE(format_str VARCHAR(255), string_val VARCHAR(255)) RETURNS DATE AS $$
+DECLARE
+    -- postgres template equivalent to the BigQuery format string
+    psql_format TEXT := BIGQUERY_FORMAT_TO_PSQL(format_str);
+BEGIN
+    RETURN TO_DATE(string_val, psql_format);
+END; $$
+LANGUAGE PLPGSQL;
+
+-- Emulates BigQuery's FORMAT_DATETIME(format_str, datetime_val).
+--   format_str   : BigQuery format string, translated by BIGQUERY_FORMAT_TO_PSQL
+--   datetime_val : timestamp to format
+-- Returns datetime_val rendered by TO_CHAR with the translated template.
+CREATE OR REPLACE FUNCTION FORMAT_DATETIME(format_str VARCHAR(255), datetime_val TIMESTAMP(3)) RETURNS TEXT AS $$
+DECLARE
+    -- postgres template equivalent to the BigQuery format string
+    psql_format TEXT := BIGQUERY_FORMAT_TO_PSQL(format_str);
+BEGIN
+    RETURN TO_CHAR(datetime_val, psql_format);
+END; $$
+LANGUAGE PLPGSQL;
+
+
+-- Emulates BigQuery's PARSE_DATETIME(format_str, string_val).
+--   format_str : BigQuery format string, translated by BIGQUERY_FORMAT_TO_PSQL
+--                (only a few directives are handled, see that function)
+--   string_val : text to parse
+-- Returns the parsed wall-clock time as TIMESTAMP(3).
+-- TO_TIMESTAMP yields TIMESTAMP WITH TIME ZONE, interpreted in the active time
+-- zone. The function therefore pins its own time zone to UTC (which has no DST
+-- transitions) so that the conversion back to a zone-less timestamp cannot
+-- shift wall-clock times that fall into a DST gap of the session time zone.
+-- NOTE(review): inputs that carry an explicit zone offset would now be
+-- normalised to UTC instead of the session zone -- confirm no caller relies on that.
+CREATE OR REPLACE FUNCTION PARSE_DATETIME(format_str VARCHAR(255), string_val VARCHAR(255)) RETURNS TIMESTAMP(3) AS $$
+BEGIN
+RETURN CAST(
+    TO_TIMESTAMP(
+        string_val,
+        -- use replace to convert BigQuery string format to postgres string format
+        BIGQUERY_FORMAT_TO_PSQL(format_str)
+    )
+    AS TIMESTAMP(3)
+);
+END; $$
+LANGUAGE PLPGSQL
+SET timezone TO 'UTC';
+
+
diff --git a/mimic-iv/concepts/postgres_make_concepts.sh b/mimic-iv/concepts/postgres_make_concepts.sh
@@ -0,0 +1,93 @@
+#!/bin/bash
+# This file makes tables for the concepts in this subfolder.
+# Be sure to run postgres-functions.sql first, as the concepts rely on those function definitions.
+# Note that this may take a large amount of time and hard drive space.
+
+# String replacements are necessary for some queries.
+# Strip the backticks and the BigQuery project name from table references,
+# e.g. `physionet-data.mimic_icu.icustays` -> mimic_icu.icustays.
+# The table name is matched with [^`]+ because sed has no lazy quantifiers: a
+# "(.+?)" group is greedy and would swallow everything up to the last backtick
+# on a line that contains more than one table reference.
+export REGEX_SCHEMA='s/`physionet-data\.(mimic_core|mimic_icu|mimic_derived|mimic_hosp)\.([^`]+)`/\1.\2/g'
+# Note that these queries are very sensitive to changes, e.g. adding whitespaces after comma can already change the behavior.
+# Quote the date part of DATETIME_DIFF so it is passed as a string to the postgres function.
+export REGEX_DATETIME_DIFF="s/DATETIME_DIFF\((.+?),\s?(.+?),\s?(DAY|MINUTE|SECOND|HOUR|YEAR)\)/DATETIME_DIFF(\1,\2,'\3')/g"
+# Add necessary quotes to INTERVAL, e.g. "INTERVAL 5 hour" to "INTERVAL '5' hour"
+export REGEX_INTERVAL="s/interval\s([[:digit:]]+)\s(hour|day|month|year)/INTERVAL '\1' \2/gI"
+# Add numeric cast to ROUND(), e.g. "ROUND(1.234, 2)" to "ROUND( CAST(1.234 as numeric), 2)".
+# Applied with perl (-0777 reads the whole input at once) so the first argument may span lines.
+export PERL_REGEX_ROUND='s/ROUND\(((.|\n)*?)\, /ROUND\( CAST\( \1 as numeric\)\,/g'
+# Specific queries for some problems that arose with some files.
+export REGEX_INT="s/CAST\(hr AS INT64\)/CAST\(hr AS bigint\)/g"
+export REGEX_ARRAY="s/GENERATE_ARRAY\(-24, CEIL\(DATETIME\_DIFF\(it\.outtime_hr, it\.intime_hr, HOUR\)\)\)/ARRAY\(SELECT \* FROM generate\_series\(-24, CEIL\(DATETIME\_DIFF\(it\.outtime_hr, it\.intime_hr, HOUR\)\)\)\)/g"
+export REGEX_HOUR_INTERVAL="s/INTERVAL CAST\(hr AS INT64\) HOUR/interval \'1\' hour * CAST\(hr AS bigint\)/g"
+# Connection options handed to every psql call; left unquoted at the call
+# sites on purpose so that the shell splits it into separate arguments.
+export CONNSTR='-U postgres -h localhost -p 5500 -d mimic-iv'  # -d mimic
+
+# This is set as the search_path variable for psql.
+# A search path of "public,mimic_icu" resolves unqualified names against the
+# public and mimic_icu schemas. The concept tables themselves are created in
+# the ${TARGET_DATASET} schema, because every CREATE TABLE below is schema-qualified.
+export PSQL_PREAMBLE='SET search_path TO public,mimic_icu'
+export TARGET_DATASET='mimic_derived'
+
+echo ''
+echo '==='
+echo 'Beginning to create tables for MIMIC database.'
+echo 'Any notices of the form "NOTICE: TABLE "XXXXXX" does not exist" can be ignored.'
+echo 'The scripts drop views before creating them, and these notices indicate nothing existed prior to creating the view.'
+echo '==='
+echo ''
+# The following tables are generated up front, ahead of the directory loop.
+echo "Generating ${TARGET_DATASET}.icustay_times"
+{ echo "${PSQL_PREAMBLE}; DROP TABLE IF EXISTS ${TARGET_DATASET}.icustay_times; CREATE TABLE ${TARGET_DATASET}.icustay_times AS "; cat demographics/icustay_times.sql;} | sed -r -e "${REGEX_DATETIME_DIFF}" | sed -r -e "${REGEX_SCHEMA}" | sed -r -e "${REGEX_INTERVAL}" | perl -0777 -pe "${PERL_REGEX_ROUND}" |  psql ${CONNSTR}
+
+echo "Generating ${TARGET_DATASET}.weight_durations"
+{ echo "${PSQL_PREAMBLE}; DROP TABLE IF EXISTS ${TARGET_DATASET}.weight_durations; CREATE TABLE ${TARGET_DATASET}.weight_durations AS "; cat demographics/weight_durations.sql;} | sed -r -e "${REGEX_DATETIME_DIFF}" | sed -r -e "${REGEX_SCHEMA}" | sed -r -e "${REGEX_INTERVAL}" | psql ${CONNSTR}
+
+echo "Generating ${TARGET_DATASET}.urine_output"
+{ echo "${PSQL_PREAMBLE}; DROP TABLE IF EXISTS ${TARGET_DATASET}.urine_output; CREATE TABLE ${TARGET_DATASET}.urine_output AS "; cat measurement/urine_output.sql;} | sed -r -e "${REGEX_DATETIME_DIFF}" | sed -r -e "${REGEX_SCHEMA}" | sed -r -e "${REGEX_INTERVAL}" | perl -0777 -pe "${PERL_REGEX_ROUND}" |  psql ${CONNSTR}
+
+# Explicit Regex for cast of second to 'second' in organfailure/kdigo_uo.
+export REGEX_SECONDS="s/SECOND\)/\'SECOND\'\)/g"
+echo "Generating ${TARGET_DATASET}.kdigo_uo"
+{ echo "${PSQL_PREAMBLE}; DROP TABLE IF EXISTS ${TARGET_DATASET}.kdigo_uo; CREATE TABLE ${TARGET_DATASET}.kdigo_uo AS "; cat organfailure/kdigo_uo.sql;} | sed -r -e "${REGEX_DATETIME_DIFF}" | sed -r -e "${REGEX_SCHEMA}" | sed -r -e "${REGEX_INTERVAL}" | sed -r -e "${REGEX_SECONDS}" | psql ${CONNSTR}
+
+
+# generate tables in subfolders
+# order is important for a few tables here:
+# * firstday should go last
+# * sepsis depends on score (sofa.sql in particular)
+# * organfailure depends on measurement
+# * repeated score and sepsis at the end because some tables depend on each other
+for d in demographics measurement comorbidity medication organfailure treatment score sepsis firstday score sepsis;
+do
+    # Iterate with a glob instead of parsing `ls` output, so that file names
+    # are never word-split.
+    for path in "${d}"/*;
+    do
+        # an unmatched glob stays literal (missing or empty folder): nothing to do
+        [[ -e "${path}" ]] || continue
+        fn="${path##*/}"
+        echo "${d}"
+        # only run SQL queries
+        if [[ "${fn}" == *.sql ]]; then
+            # table name is file name minus extension
+            # (${fn%.sql} also works on bash < 4.2, unlike a negative substring length)
+            tbl="${fn%.sql}"
+
+            # Create first_day_lab after measurements done and before it is used by scores.
+            if [[ "${tbl}" == "charlson" ]]; then
+                # Generate some tables first to prevent conflicts during processing.
+                # Have to replace column names. Probably a mistake in the original SQL script.
+                export REGEX_LAB_1="s/abs_basophils/basophils_abs/g"
+                export REGEX_LAB_2="s/abs_eosinophils/eosinophils_abs/g"
+                export REGEX_LAB_3="s/abs_lymphocytes/lymphocytes_abs/g"
+                export REGEX_LAB_4="s/abs_monocytes/monocytes_abs/g"
+                export REGEX_LAB_5="s/abs_neutrophils/neutrophils_abs/g"
+                export REGEX_LAB_6="s/atyps/atypical_lymphocytes/g"
+                export REGEX_LAB_7="s/imm_granulocytes/immature_granulocytes/g"
+                export REGEX_LAB_8="s/metas/metamyelocytes/g"
+                echo "Generating ${TARGET_DATASET}.first_day_lab"
+                { echo "${PSQL_PREAMBLE}; DROP TABLE IF EXISTS ${TARGET_DATASET}.first_day_lab; CREATE TABLE ${TARGET_DATASET}.first_day_lab AS "; cat firstday/first_day_lab.sql;} | sed -r -e "${REGEX_DATETIME_DIFF}" | sed -r -e "${REGEX_SCHEMA}" | sed -r -e "${REGEX_INTERVAL}" | sed -r -e "${REGEX_LAB_1}" | sed -r -e "${REGEX_LAB_2}" | sed -r -e "${REGEX_LAB_3}" | sed -r -e "${REGEX_LAB_4}" | sed -r -e "${REGEX_LAB_5}" | sed -r -e "${REGEX_LAB_6}" | sed -r -e "${REGEX_LAB_7}" | sed -r -e "${REGEX_LAB_8}" | perl -0777 -pe "${PERL_REGEX_ROUND}" |  psql ${CONNSTR}
+            fi
+
+            # skip first_day_sofa as it depends on other firstday queries, also skip already processed tables.
+            if [[ "${tbl}" == "first_day_sofa" ]] || [[ "${tbl}" == "icustay_times" ]] || [[ "${tbl}" == "weight_durations" ]] || [[ "${tbl}" == "urine_output" ]] || [[ "${tbl}" == "kdigo_uo" ]] || [[ "${tbl}" == "first_day_lab" ]]; then
+                continue
+            fi
+            echo "Generating ${TARGET_DATASET}.${tbl}"
+            { echo "${PSQL_PREAMBLE}; DROP TABLE IF EXISTS ${TARGET_DATASET}.${tbl}; CREATE TABLE ${TARGET_DATASET}.${tbl} AS "; cat "${d}/${fn}";} | sed -r -e "${REGEX_ARRAY}" | sed -r -e "${REGEX_HOUR_INTERVAL}" | sed -r -e "${REGEX_INT}" | sed -r -e "${REGEX_DATETIME_DIFF}" | sed -r -e "${REGEX_SCHEMA}" | sed -r -e "${REGEX_INTERVAL}" | perl -0777 -pe "${PERL_REGEX_ROUND}" |  psql ${CONNSTR}
+        fi
+    done
+done
+
+
+# first_day_sofa is skipped inside the loop and generated here, once
+# everything else has been processed
+echo "Generating ${TARGET_DATASET}.first_day_sofa"
+{
+    # prepend the search path / DROP / CREATE TABLE ... AS header to the query
+    echo "${PSQL_PREAMBLE}; DROP TABLE IF EXISTS ${TARGET_DATASET}.first_day_sofa; CREATE TABLE ${TARGET_DATASET}.first_day_sofa AS "
+    cat firstday/first_day_sofa.sql
+} |
+    sed -r -e "${REGEX_DATETIME_DIFF}" -e "${REGEX_SCHEMA}" -e "${REGEX_INTERVAL}" |
+    perl -0777 -pe "${PERL_REGEX_ROUND}" |
+    psql ${CONNSTR}