Commit 72d51fe

Merge pull request #1784 from MIT-LCP/sqlite_python_improvements
SQLite python improvements
2 parents 54bfe2a + edbc9cb

1 file changed: 160 additions & 24 deletions
@@ -1,35 +1,171 @@
+from argparse import ArgumentParser
+import json
 import os
+from pathlib import Path
 import sqlite3
 import sys
-
-from glob import glob
+import typing as t
 import pandas as pd
 
 DATABASE_NAME = "mimic4.db"
 THRESHOLD_SIZE = 5 * 10**7
 CHUNKSIZE = 10**6
 
-if os.path.exists(DATABASE_NAME):
-    msg = "File {} already exists.".format(DATABASE_NAME)
-    print(msg)
-    sys.exit()
-
-with sqlite3.Connection(DATABASE_NAME) as connection:
-    for f in glob("**/*.csv*", recursive=True):
-        print("Starting processing {}".format(f))
-        folder, filename = os.path.split(f)
-        tablename = filename.lower()
-        if tablename.endswith('.gz'):
-            tablename = tablename[:-3]
-        if tablename.endswith('.csv'):
-            tablename = tablename[:-4]
-        if os.path.getsize(f) < THRESHOLD_SIZE:
-            df = pd.read_csv(f)
-            df.to_sql(tablename, connection)
+_MIMIC_TABLES = (
+    # hospital EHR derived tables
+    'admissions',
+    'd_hcpcs',
+    'd_icd_diagnoses',
+    'd_icd_procedures',
+    'd_labitems',
+    'diagnoses_icd',
+    'drgcodes',
+    'emar',
+    'emar_detail',
+    'hcpcsevents',
+    'labevents',
+    'microbiologyevents',
+    'omr',
+    'patients',
+    'pharmacy',
+    'poe',
+    'poe_detail',
+    'prescriptions',
+    'procedures_icd',
+    'provider',
+    'services',
+    'transfers',
+    # ICU derived tables
+    'caregiver',
+    'chartevents',
+    'd_items',
+    'datetimeevents',
+    'icustays',
+    'ingredientevents',
+    'inputevents',
+    'outputevents',
+    'procedureevents',
+)
+
+def process_dataframe(df: pd.DataFrame, subjects: t.Optional[t.List[int]] = None) -> pd.DataFrame:
+    for c in df.columns:
+        if c.endswith('time') or c.endswith('date'):
+            df[c] = pd.to_datetime(df[c], format='ISO8601')
+
+    if subjects is not None and 'subject_id' in df:
+        df = df.loc[df['subject_id'].isin(subjects)]
+
+    return df
+
+def main():
+    argparser = ArgumentParser()
+    argparser.add_argument(
+        '--limit', type=int, default=0,
+        help='Restrict the database to the first N subject_id.'
+    )
+    argparser.add_argument(
+        '--data_dir', type=str, default='.',
+        help='Path to the directory containing the MIMIC-IV CSV files.'
+    )
+    argparser.add_argument(
+        '--overwrite', action='store_true',
+        help='Overwrite existing mimic4.db file.'
+    )
+    args = argparser.parse_args()
+
+    # validate that we can find all the files
+    data_dir = Path(args.data_dir).resolve()
+    data_files = list(data_dir.rglob('**/*.csv*'))
+    if not data_files:
+        print(f"No CSV files found in {data_dir}")
+        sys.exit()
+
+    # remove suffixes from data files -> also lower case tablenames
+    # creates index aligned array for data files
+    tablenames = []
+    for f in data_files:
+        while f.suffix.lower() in {'.csv', '.gz'}:
+            f = f.with_suffix('')
+        tablenames.append(f.stem.lower())
+
+    # check that all the expected tables are present
+    expected_tables = set([t for t in tablenames])
+    missing_tables = set(_MIMIC_TABLES) - expected_tables
+    if missing_tables:
+        print(expected_tables)
+        print(f"Missing tables: {missing_tables}")
+        sys.exit()
+
+    pt = None
+    subjects = None
+    if args.limit > 0:
+        for f in data_files:
+            if 'patients' in f.name:
+                pt = pd.read_csv(f)
+                break
+        if pt is None:
+            raise FileNotFoundError('Unable to find a patients file in current folder.')
+
+        pt = pt[['subject_id']].sort_values('subject_id').head(args.limit)
+        subjects = set(sorted(pt['subject_id'].tolist())[:args.limit])
+        print(f'Limiting to {len(subjects)} subjects.')
+
+    if os.path.exists(DATABASE_NAME):
+        if args.overwrite:
+            os.remove(DATABASE_NAME)
         else:
-            # If the file is too large, let's do the work in chunks
-            for chunk in pd.read_csv(f, chunksize=CHUNKSIZE, low_memory=False):
-                chunk.to_sql(tablename, connection, if_exists="append")
-        print("Finished processing {}".format(f))
+            msg = "File {} already exists.".format(DATABASE_NAME)
+            print(msg)
+            sys.exit()
+
+    # For a subset of columns, we specify the data types to ensure
+    # pandas loads the data correctly.
+    mimic_dtypes = {
+        "subject_id": pd.Int64Dtype(),
+        "hadm_id": pd.Int64Dtype(),
+        "stay_id": pd.Int64Dtype(),
+        "caregiver_id": pd.Int64Dtype(),
+        "provider_id": str,
+        "category": str,  # d_hcpcs
+        "parent_field_ordinal": str,
+        "pharmacy_id": pd.Int64Dtype(),
+        "emar_seq": pd.Int64Dtype(),
+        "poe_seq": pd.Int64Dtype(),
+        "ndc": str,
+        "doses_per_24_hrs": pd.Int64Dtype(),
+        "drg_code": str,
+        "org_itemid": pd.Int64Dtype(),
+        "isolate_num": pd.Int64Dtype(),
+        "quantity": str,
+        "ab_itemid": pd.Int64Dtype(),
+        "dilution_text": str,
+        "warning": pd.Int64Dtype(),
+        "valuenum": float,
+    }
+
+    row_counts = {t: 0 for t in set(tablenames) | set(_MIMIC_TABLES)}
+    with sqlite3.Connection(DATABASE_NAME) as connection:
+        for i, f in enumerate(data_files):
+            tablename = tablenames[i]
+            print("Starting processing {}".format(tablename), end='.. ')
+            if os.path.getsize(f) < THRESHOLD_SIZE:
+                df = pd.read_csv(f, dtype=mimic_dtypes)
+                df = process_dataframe(df, subjects=subjects)
+                df.to_sql(tablename, connection, index=False)
+                row_counts[tablename] += len(df)
+            else:
+                # If the file is too large, let's do the work in chunks
+                for chunk in pd.read_csv(f, chunksize=CHUNKSIZE, low_memory=False, dtype=mimic_dtypes):
+                    chunk = process_dataframe(chunk)
+                    chunk.to_sql(tablename, connection, if_exists="append", index=False)
+                    row_counts[tablename] += len(chunk)
+            print("done!")
+
+    print("Should be all done! Row counts of loaded data:\n")
+
+    print(json.dumps(row_counts, indent=2))
+
+
 
-print("Should be all done!")
+if __name__ == '__main__':
+    main()
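For anyone trying the merged script, the argparse flags suggest an invocation along the lines of `python <script>.py --data_dir /path/to/mimic-iv --limit 100 --overwrite` (the script's filename is not shown on this page). The sketch below is a minimal, hypothetical sanity check of the resulting database, not part of this commit: only `mimic4.db` and table names such as `admissions` come from the script itself, and `subject_id`, `hadm_id`, and `admittime` are columns in the published MIMIC-IV admissions table.

```python
# Hypothetical post-load sanity check -- not part of this commit.
# Assumes the loader above has already produced mimic4.db.
import sqlite3

import pandas as pd

with sqlite3.connect("mimic4.db") as connection:
    # Recompute per-table row counts, comparable to the JSON summary
    # the script prints at the end of a run.
    tables = pd.read_sql_query(
        "SELECT name FROM sqlite_master WHERE type = 'table' ORDER BY name",
        connection,
    )
    for name in tables["name"]:
        count = pd.read_sql_query(f"SELECT COUNT(*) AS n FROM {name}", connection)
        print(f"{name}: {count['n'].iloc[0]}")

    # Spot-check a table that went through process_dataframe: its
    # *time columns were parsed with pd.to_datetime before writing.
    adm = pd.read_sql_query(
        "SELECT subject_id, hadm_id, admittime FROM admissions LIMIT 5",
        connection,
    )
    print(adm)
```

The counts should line up with the `row_counts` dictionary the script prints, since both loading branches accumulate `len(df)` or `len(chunk)` per table.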
