+from argparse import ArgumentParser
+import json
 import os
+from pathlib import Path
 import sqlite3
 import sys
-
-from glob import glob
+import typing as t
 import pandas as pd

 DATABASE_NAME = "mimic4.db"
 THRESHOLD_SIZE = 5 * 10 ** 7
 CHUNKSIZE = 10 ** 6
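+# Files larger than THRESHOLD_SIZE bytes (50 MB) are read in CHUNKSIZE-row
+# chunks so large tables do not have to fit in memory all at once.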

-if os.path.exists(DATABASE_NAME):
-    msg = "File {} already exists.".format(DATABASE_NAME)
-    print(msg)
-    sys.exit()
-
-with sqlite3.Connection(DATABASE_NAME) as connection:
-    for f in glob("**/*.csv*", recursive=True):
-        print("Starting processing {}".format(f))
-        folder, filename = os.path.split(f)
-        tablename = filename.lower()
-        if tablename.endswith('.gz'):
-            tablename = tablename[:-3]
-        if tablename.endswith('.csv'):
-            tablename = tablename[:-4]
-        if os.path.getsize(f) < THRESHOLD_SIZE:
-            df = pd.read_csv(f)
-            df.to_sql(tablename, connection)
+_MIMIC_TABLES = (
+    # hospital EHR derived tables
+    'admissions',
+    'd_hcpcs',
+    'd_icd_diagnoses',
+    'd_icd_procedures',
+    'd_labitems',
+    'diagnoses_icd',
+    'drgcodes',
+    'emar',
+    'emar_detail',
+    'hcpcsevents',
+    'labevents',
+    'microbiologyevents',
+    'omr',
+    'patients',
+    'pharmacy',
+    'poe',
+    'poe_detail',
+    'prescriptions',
+    'procedures_icd',
+    'provider',
+    'services',
+    'transfers',
+    # ICU derived tables
+    'caregiver',
+    'chartevents',
+    'd_items',
+    'datetimeevents',
+    'icustays',
+    'ingredientevents',
+    'inputevents',
+    'outputevents',
+    'procedureevents',
+)
+
+def process_dataframe(df: pd.DataFrame, subjects: t.Optional[t.Collection[int]] = None) -> pd.DataFrame:
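+    """Parse *time/*date columns as datetimes and optionally filter rows.
+
+    Note: format='ISO8601' in pd.to_datetime requires pandas >= 2.0.
+    """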
+    for c in df.columns:
+        if c.endswith('time') or c.endswith('date'):
+            df[c] = pd.to_datetime(df[c], format='ISO8601')
+
+    if subjects is not None and 'subject_id' in df:
+        df = df.loc[df['subject_id'].isin(subjects)]
+
+    return df
+
+def main():
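+    """Build mimic4.db from the MIMIC-IV CSV files found under --data_dir."""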
+    argparser = ArgumentParser()
+    argparser.add_argument(
+        '--limit', type=int, default=0,
+        help='Restrict the database to the first N subject_id values.'
+    )
+    argparser.add_argument(
+        '--data_dir', type=str, default='.',
+        help='Path to the directory containing the MIMIC-IV CSV files.'
+    )
+    argparser.add_argument(
+        '--overwrite', action='store_true',
+        help='Overwrite existing mimic4.db file.'
+    )
+    args = argparser.parse_args()
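+    # Example invocation (script filename assumed here):
+    #   python load_mimic4.py --data_dir /path/to/mimic-iv --limit 100 --overwrite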
+
+    # validate that we can find all the files
+    data_dir = Path(args.data_dir).resolve()
+    data_files = list(data_dir.rglob('*.csv*'))
+    if not data_files:
+        print(f"No CSV files found in {data_dir}")
+        sys.exit()
+
+    # strip .csv/.gz suffixes and lower-case to get table names;
+    # tablenames is index-aligned with data_files
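+    # e.g. 'admissions.csv.gz' -> 'admissions'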
+    tablenames = []
+    for f in data_files:
+        while f.suffix.lower() in {'.csv', '.gz'}:
+            f = f.with_suffix('')
+        tablenames.append(f.name.lower())
+
+    # check that all the expected tables are present
+    found_tables = set(tablenames)
+    missing_tables = set(_MIMIC_TABLES) - found_tables
+    if missing_tables:
+        print(f"Found tables: {found_tables}")
+        print(f"Missing tables: {missing_tables}")
+        sys.exit()
+
+    pt = None
+    subjects = None
+    if args.limit > 0:
+        for f in data_files:
+            if 'patients' in f.name:
+                pt = pd.read_csv(f)
+                break
+        if pt is None:
+            raise FileNotFoundError(f'Unable to find a patients file in {data_dir}.')
+
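+        # keep the args.limit lowest subject_id values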
+        subjects = set(pt['subject_id'].sort_values().head(args.limit))
+        print(f'Limiting to {len(subjects)} subjects.')
+
+    if os.path.exists(DATABASE_NAME):
+        if args.overwrite:
+            os.remove(DATABASE_NAME)
         else:
-            # If the file is too large, let's do the work in chunks
-            for chunk in pd.read_csv(f, chunksize=CHUNKSIZE, low_memory=False):
-                chunk.to_sql(tablename, connection, if_exists="append")
-        print("Finished processing {}".format(f))
+            msg = "File {} already exists.".format(DATABASE_NAME)
+            print(msg)
+            sys.exit()
+
+    # For a subset of columns, we specify the data types to ensure
+    # pandas loads the data correctly.
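+    # pd.Int64Dtype() is pandas' nullable integer dtype, so integer ID columns
+    # with missing values are not silently upcast to float. Columns not listed
+    # here keep their inferred dtypes.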
+    mimic_dtypes = {
+        "subject_id": pd.Int64Dtype(),
+        "hadm_id": pd.Int64Dtype(),
+        "stay_id": pd.Int64Dtype(),
+        "caregiver_id": pd.Int64Dtype(),
+        "provider_id": str,
+        "category": str,  # d_hcpcs
+        "parent_field_ordinal": str,
+        "pharmacy_id": pd.Int64Dtype(),
+        "emar_seq": pd.Int64Dtype(),
+        "poe_seq": pd.Int64Dtype(),
+        "ndc": str,
+        "doses_per_24_hrs": pd.Int64Dtype(),
+        "drg_code": str,
+        "org_itemid": pd.Int64Dtype(),
+        "isolate_num": pd.Int64Dtype(),
+        "quantity": str,
+        "ab_itemid": pd.Int64Dtype(),
+        "dilution_text": str,
+        "warning": pd.Int64Dtype(),
+        "valuenum": float,
+    }
+
+    row_counts = {name: 0 for name in set(tablenames) | set(_MIMIC_TABLES)}
+    with sqlite3.connect(DATABASE_NAME) as connection:
+        for tablename, f in zip(tablenames, data_files):
+            print("Starting processing {}".format(tablename), end='.. ')
+            if os.path.getsize(f) < THRESHOLD_SIZE:
+                df = pd.read_csv(f, dtype=mimic_dtypes)
+                df = process_dataframe(df, subjects=subjects)
+                df.to_sql(tablename, connection, index=False)
+                row_counts[tablename] += len(df)
+            else:
+                # If the file is too large, let's do the work in chunks
+                for chunk in pd.read_csv(f, chunksize=CHUNKSIZE, low_memory=False, dtype=mimic_dtypes):
+                    chunk = process_dataframe(chunk, subjects=subjects)
+                    chunk.to_sql(tablename, connection, if_exists="append", index=False)
+                    row_counts[tablename] += len(chunk)
+            print("done!")
+
+    print("Should be all done! Row counts of loaded data:\n")
+
+    print(json.dumps(row_counts, indent=2))
+
+

-print("Should be all done!")
+if __name__ == '__main__':
+    main()